feat(session-async): add /api/score/session_async with incremental session report aggregation

- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00
parent e1751447df
commit 754a30ad59
36 changed files with 2004 additions and 51 deletions
--- a/webapp/services/pipeline_task_manager.py
+++ b/webapp/services/pipeline_task_manager.py
@@ -0,0 +1,257 @@
+"""Background task manager for end-to-end pipeline jobs (build + eval).
+
+Each job runs three sequential phases inside a worker thread:
+  1. parsing_documents  — AliyunDocmind parses every PDF
+  2. generating_questions — LLM generates a draft question bank (task.phase still reads parsing_documents)
+  3. evaluating          — RAGAS online evaluation scores each question
+
+The DatasetBuildJob and Scenario objects are constructed entirely from the
+API request parameters, so no YAML config files are needed.
+"""
+
+from __future__ import annotations
+
+import io
+import threading
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import redirect_stderr, redirect_stdout
+from datetime import datetime, timezone
+from pathlib import Path
+
+from webapp.models import (
+    PipelineJobRequest,
+    PipelineJobStatus,
+    PipelineResult,
+)
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]  # two levels above this module's directory
+_PIPELINE_OUTPUT_ROOT = _REPO_ROOT.joinpath("outputs", "pipeline")  # per-job dirs live below this
+
+
+def _now_iso() -> str:  # current UTC time as an ISO-8601 string with an explicit +00:00 offset
+    return datetime.now(tz=timezone.utc).isoformat()
+
+
+class _LineCapture(io.TextIOBase):
+    """Write-only text stream that passes each completed line to ``sink.append_log``."""
+
+    def __init__(self, sink: "PipelineTask") -> None:
+        self._sink = sink
+        self._buffer = ""  # text received after the most recent newline
+
+    def write(self, text: str) -> int:
+        """Log every line completed by ``text``, keep the remainder, return ``len(text)``."""
+        *finished, self._buffer = (self._buffer + text).split("\n")
+        for entry in finished:
+            self._sink.append_log(entry)
+        return len(text)
+
+    def flush(self) -> None:  # log the buffered partial line, if any, as its own entry
+        leftover, self._buffer = self._buffer, ""
+        if leftover:
+            self._sink.append_log(leftover)
+
+
+class PipelineTask:
+    """Mutable record of one pipeline job; log appends and snapshots share ``_lock``."""
+
+    def __init__(self, job_id: str, job_name: str) -> None:
+        self._lock = threading.Lock()
+        self.job_id, self.job_name = job_id, job_name
+        self.status, self.phase = "queued", "idle"  # initial values; rewritten by the worker
+        self.logs: list[str] = []
+        self.result: PipelineResult | None = None
+        self.error: str | None = None
+        self.created_at = _now_iso()
+        self.finished_at = ""  # stays empty until the worker stamps a finish time
+
+    def append_log(self, line: str) -> None:
+        """Append ``line`` to ``logs`` while holding the lock."""
+        with self._lock:
+            self.logs.append(line)
+
+    def snapshot(self) -> PipelineJobStatus:
+        """Return a ``PipelineJobStatus`` built from the current fields under the lock."""
+        with self._lock:
+            return PipelineJobStatus(
+                job_id=self.job_id,
+                job_name=self.job_name,
+                created_at=self.created_at,
+                finished_at=self.finished_at,
+                status=self.status,
+                phase=self.phase,
+                result=self.result,
+                error=self.error,
+                logs=list(self.logs),  # copy, so later appends do not touch the snapshot
+            )
+
+
+class PipelineTaskManager:
+    """Owns the job registry and the thread pool that runs pipeline jobs."""
+
+    def __init__(self, max_workers: int = 2) -> None:
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tasks: dict[str, PipelineTask] = {}
+        self._lock = threading.Lock()  # guards the ``_tasks`` registry
+
+    def submit(self, request: PipelineJobRequest) -> PipelineTask:
+        """Register and queue a new job; a blank ``job_name`` becomes ``pipeline-<id prefix>``."""
+        job_id = uuid.uuid4().hex[:12]
+        job_name = request.job_name.strip() or f"pipeline-{job_id[:6]}"
+        task = PipelineTask(job_id=job_id, job_name=job_name)
+        with self._lock:
+            self._tasks[job_id] = task
+        self._executor.submit(self._run, task, request)
+        return task
+
+    def get(self, job_id: str) -> PipelineJobStatus | None:
+        """Return a snapshot of the job, or ``None`` when ``job_id`` is unknown."""
+        with self._lock:
+            task = self._tasks.get(job_id)
+        return task.snapshot() if task is not None else None
+
+    def list_jobs(self) -> list[PipelineJobStatus]:
+        """Return snapshots of all registered jobs, newest ``created_at`` first."""
+        with self._lock:
+            tasks = list(self._tasks.values())
+        snapshots = [t.snapshot() for t in tasks]
+        snapshots.sort(key=lambda s: s.created_at, reverse=True)
+        return snapshots
+
+    # ---- worker-side implementation (runs inside the thread pool) ----
+
+    def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:
+        """Worker entry point: run the pipeline and record the outcome on ``task``."""
+        task.status = "running"
+        task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")
+        capture = _LineCapture(task)
+        outcome: str | None = None  # stays None only if a non-``Exception`` error escapes
+        try:
+            # NOTE: these redirects replace the process-wide sys.stdout/sys.stderr, so
+            # output from other threads (e.g. a concurrent job) can land in this log.
+            with redirect_stdout(capture), redirect_stderr(capture):
+                result = self._execute(task, request)
+            capture.flush()
+            task.result = result
+            task.phase = "done"
+            task.append_log(f"[{_now_iso()}] pipeline 任务完成: {task.job_name}")
+            outcome = "completed"
+        except Exception as exc:  # noqa: BLE001
+            capture.flush()
+            task.error = f"{type(exc).__name__}: {exc}"
+            task.append_log(f"[{_now_iso()}] pipeline 任务失败: {task.error}")
+            outcome = "failed"
+        finally:
+            # Stamp ``finished_at`` first and publish the terminal status last, so a
+            # snapshot that reports "completed"/"failed" never has it still empty.
+            task.finished_at = _now_iso()
+            if outcome is not None:
+                task.status = outcome
+
+    def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:
+        """Run the build phase, then the eval phase, and describe the artifacts.
+
+        Raises ``ValueError`` for a missing docs directory, ``RuntimeError`` for an empty bank.
+        """
+        docs_path = Path(req.docs_path)
+        if not docs_path.is_absolute():
+            docs_path = (_REPO_ROOT / docs_path).resolve()
+        if not docs_path.is_dir():
+            raise ValueError(f"docs_path is not an existing directory: {docs_path}")
+
+        job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id
+        build_artifact_dir = job_output_dir / "build"
+        dataset_csv = job_output_dir / "generated_dataset.csv"
+        eval_output_dir = job_output_dir / "eval"
+
+        # Phases 1 + 2: one build call parses the documents and generates the questions.
+        task.phase = "parsing_documents"
+        task.append_log(f"  [build] 扫描文档目录: {docs_path}")
+        build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)
+        source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"
+        total_q = len(build_result.draft_samples)
+        parse_failures = len(build_result.parse_failures)
+        task.append_log(f"  [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")
+        if total_q == 0:
+            raise RuntimeError("题库为空（所有文档均解析或生成失败），中止评估。")
+
+        # Phase 3: evaluation.
+        task.phase = "evaluating"
+        task.append_log(f"  [eval] 开始 RAGAS 评估，共 {total_q} 道题目")
+        eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)
+
+        from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths
+        eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)
+        return PipelineResult(
+            build_artifact_dir=build_artifact_dir.as_posix(),
+            dataset_csv=dataset_csv.as_posix(),
+            source_chunks_jsonl=source_chunks_jsonl.as_posix(),
+            total_questions=total_q,
+            parse_failures=parse_failures,
+            eval_run_id=eval_result.run_id,
+            eval_output_dir=eval_result.scenario.output_dir.as_posix(),
+            scores_csv=eval_artifact_paths.scores_csv.as_posix(),
+            summary_md=eval_artifact_paths.summary_md.as_posix(),
+        )
+
+    def _run_build(self, task: PipelineTask, req: PipelineJobRequest,
+                   docs_path: Path, artifact_dir: Path, dataset_csv: Path):
+        """Build a ``DatasetBuildJob`` from the request and return the build runner's result."""
+        from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
+        from rag_eval.dataset_builder.runner import execute_dataset_build_job
+        from rag_eval.settings import EvaluationSettings
+
+        settings = EvaluationSettings()
+        job = DatasetBuildJob(
+            job_name=task.job_name,
+            input_path=docs_path,
+            input_glob="*.pdf",
+            parser_provider="aliyun_docmind",
+            failure_mode=req.failure_mode,  # type: ignore[arg-type]
+            generation_model=req.generation_model,
+            output_type="online_question_bank",
+            review_mode="draft_with_manual_review",
+            max_questions_per_document=req.max_questions_per_document,
+            max_source_chunks_per_question=req.max_source_chunks_per_question,
+            dataset_path=dataset_csv,
+            artifact_dir=artifact_dir,
+            runtime=DatasetBuildRuntime(max_documents=req.max_documents),
+        )
+        return execute_dataset_build_job(job, settings=settings)
+
+    def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,
+                  dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):
+        """Build an online ``Scenario`` over the generated dataset and run it."""
+        from rag_eval.execution.runner import run_scenario_from_scenario_obj
+        from rag_eval.settings import EvaluationSettings
+        from rag_eval.shared.models import AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario
+
+        settings = EvaluationSettings()
+        adapter_kwargs = {"source_chunks_path": source_chunks_jsonl, "model": req.answer_model}
+        scenario = Scenario(
+            scenario_name=task.job_name,
+            mode="online",
+            dataset=DatasetConfig(path=dataset_csv),
+            judge_model=req.judge_model,
+            embedding_model=req.embedding_model,
+            metrics=list(req.metrics),
+            output_dir=eval_output_dir,
+            runtime=RuntimeConfig(
+                batch_size=4,
+                app_concurrency=2,
+                metric_concurrency=2,
+                max_samples=req.max_samples,
+            ),
+            app_adapter=AppAdapterConfig(
+                type="python",
+                callable="apps.siemens_pdf_qa.adapter:run",
+                static_kwargs=adapter_kwargs,
+            ),
+            optimization_advisor=req.optimization_advisor,
+        )
+        return run_scenario_from_scenario_obj(scenario, settings=settings)
+
+
+# Module-level singleton shared by the FastAPI routes; created at import time with default settings.
+pipeline_task_manager = PipelineTaskManager()