# siemens_ragas/webapp/services/pipeline_task_manager.py

"""Background task manager for end-to-end pipeline jobs (build + eval).

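Each job runs three sequential phases inside a worker thread:
  1. parsing_documents  — AliyunDocmind parses every PDF
  2. generating_questions — LLM generates a draft question bank
  3. evaluating          — RAGAS online evaluation scores each question

Phases 1 and 2 are executed by a single dataset-build call, so the reported
``task.phase`` stays ``parsing_documents`` for both of them; it then moves to
``evaluating`` and finally to ``done`` when the job completes.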

The DatasetBuildJob and Scenario objects are constructed entirely from the
API request parameters, so no YAML config files are needed.
"""

from __future__ import annotations

import io
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime, timezone
from pathlib import Path

from webapp.models import (
    PipelineJobRequest,
    PipelineJobStatus,
    PipelineResult,
)

# Directory two levels above this file's own directory (``parents[2]`` of the
# resolved file path); relative ``docs_path`` values are resolved against it.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Root under which every job gets its own ``<job_id>/`` output directory.
_PIPELINE_OUTPUT_ROOT = _REPO_ROOT / "outputs" / "pipeline"


def _now_iso() -> str:
    """Return the current time as a timezone-aware (UTC) ISO-8601 string."""
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()


class _LineCapture(io.TextIOBase):
    """Write-only text stream that forwards complete lines to a task's log.

    Text is buffered until a ``"\\n"`` arrives; each finished line (without
    its newline) is handed to ``sink.append_log``.  Whatever is left without
    a trailing newline is only emitted by :meth:`flush`.
    """

    def __init__(self, sink: "PipelineTask") -> None:
        """Bind the stream to *sink*, whose ``append_log`` receives each line."""
        super().__init__()
        self._sink = sink
        # Trailing text that has not been terminated by a newline yet.
        self._buffer = ""

    def writable(self) -> bool:
        """Report that the stream accepts writes.

        ``io.TextIOBase`` answers ``False`` by default, which contradicts the
        fact that :meth:`write` is implemented and would mislead any caller
        that checks ``writable()`` before writing.
        """
        return True

    def write(self, text: str) -> int:
        """Buffer *text* and forward every completed line to the sink.

        Returns:
            The number of characters accepted, i.e. ``len(text)``.
        """
        # One split handles any number of newlines in a single write; the
        # last piece is the (possibly empty) unterminated remainder.
        *complete_lines, self._buffer = (self._buffer + text).split("\n")
        for line in complete_lines:
            self._sink.append_log(line)
        return len(text)

    def flush(self) -> None:
        """Emit any unterminated trailing text as a final log line."""
        if self._buffer:
            self._sink.append_log(self._buffer)
            self._buffer = ""


class PipelineTask:
    """In-memory, mutable record of a single pipeline job (build + eval).

    The log list is appended to and copied under ``_lock`` so that a status
    snapshot taken from another thread never sees a list that is being
    mutated.
    """

    def __init__(self, job_id: str, job_name: str) -> None:
        """Create a job record in the ``queued`` / ``idle`` state."""
        self._lock = threading.Lock()
        self.job_id, self.job_name = job_id, job_name
        self.status, self.phase = "queued", "idle"
        self.created_at, self.finished_at = _now_iso(), ""
        self.logs: list[str] = []
        self.result: PipelineResult | None = None
        self.error: str | None = None

    def append_log(self, line: str) -> None:
        """Append one log line while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> PipelineJobStatus:
        """Return a ``PipelineJobStatus`` copy of the current state.

        The log list is copied, so later appends do not affect the snapshot.
        """
        scalar_fields = (
            "job_id",
            "job_name",
            "status",
            "phase",
            "result",
            "error",
            "created_at",
            "finished_at",
        )
        with self._lock:
            state = {name: getattr(self, name) for name in scalar_fields}
            state["logs"] = self.logs.copy()
            return PipelineJobStatus(**state)


class PipelineTaskManager:
    """Own the worker thread pool and the in-memory registry of pipeline jobs.

    Submitted jobs are stored in a dict keyed by job id and executed on a
    ``ThreadPoolExecutor``; nothing in this class evicts or persists them.
    """

    def __init__(self, max_workers: int = 2) -> None:
        """Create the executor and an empty job registry.

        Args:
            max_workers: Maximum number of pipeline jobs running concurrently.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, PipelineTask] = {}
        # Guards ``_tasks`` only; per-job state is guarded by each task's lock.
        self._lock = threading.Lock()

    def submit(self, request: PipelineJobRequest) -> PipelineTask:
        """Register and schedule a new pipeline job; return its task object.

        The job id is the first 12 hex characters of a random UUID.  When the
        request carries no usable name, ``pipeline-<first 6 id chars>`` is
        used instead.
        """
        job_id = uuid.uuid4().hex[:12]
        # ``or ""`` keeps the fallback working even if job_name is None.
        job_name = (request.job_name or "").strip() or f"pipeline-{job_id[:6]}"
        task = PipelineTask(job_id=job_id, job_name=job_name)
        with self._lock:
            self._tasks[job_id] = task
        self._executor.submit(self._run, task, request)
        return task

    def get(self, job_id: str) -> PipelineJobStatus | None:
        """Return a status snapshot for *job_id*, or ``None`` if unknown."""
        with self._lock:
            task = self._tasks.get(job_id)
        # Snapshot outside the registry lock so a slow snapshot cannot block
        # submissions or other lookups.
        return task.snapshot() if task is not None else None

    def list_jobs(self) -> list[PipelineJobStatus]:
        """Return snapshots of all known jobs, newest ``created_at`` first."""
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [t.snapshot() for t in tasks]
        snapshots.sort(key=lambda s: s.created_at, reverse=True)
        return snapshots

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    @staticmethod
    def _finalize(task, *, status, finished_at, phase=None, result=None, error=None) -> None:
        """Commit a job's terminal state in one step under the task lock.

        ``PipelineTask.snapshot`` reads under the same lock, so a poller sees
        either the pre-terminal state or the complete terminal state — never
        a ``completed``/``failed`` status with an empty ``finished_at``.

        Args:
            task: The job record to update.
            status: Terminal status to publish (``"completed"``/``"failed"``).
            finished_at: Completion timestamp to store.
            phase: New phase, or ``None`` to keep the current one (on failure
                the phase that was running stays visible).
            result: Pipeline result for successful jobs.
            error: Error description for failed jobs.
        """
        with task._lock:  # noqa: SLF001 - same-module collaborator
            if phase is not None:
                task.phase = phase
            task.result = result
            task.error = error
            task.finished_at = finished_at
            task.status = status

    def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:
        """Execute the full pipeline inside a worker thread and record the outcome.

        stdout/stderr written while the pipeline runs are captured line by
        line into the task log.  Any exception marks the job ``failed`` with
        ``"<ExceptionType>: <message>"`` as its error; ordinary exceptions are
        swallowed (the job status is the error channel), while non-``Exception``
        ``BaseException``s such as ``SystemExit`` are recorded and re-raised.
        """
        with task._lock:  # noqa: SLF001 - same-module collaborator
            task.status = "running"
        task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")

        capture = _LineCapture(task)
        try:
            try:
                # NOTE(review): redirect_stdout/redirect_stderr replace the
                # process-wide sys.stdout/sys.stderr.  With more than one
                # worker, concurrent jobs can log into each other's buffers
                # and out-of-order exits can leave sys.stdout pointing at a
                # finished job's capture.  Confirm whether jobs must run
                # concurrently before relying on per-job log isolation.
                with redirect_stdout(capture), redirect_stderr(capture):
                    result = self._execute(task, request)
            finally:
                # Emit any trailing output that lacked a final newline.
                capture.flush()
        except BaseException as exc:  # noqa: BLE001
            # BaseException (not just Exception): otherwise a SystemExit or
            # KeyboardInterrupt raised inside the pipeline would leave the
            # job stuck in "running" forever.
            error = f"{type(exc).__name__}: {exc}"
            task.append_log(f"[{_now_iso()}] pipeline 任务失败: {error}")
            self._finalize(task, status="failed", finished_at=_now_iso(), error=error)
            if not isinstance(exc, Exception):
                raise
            return

        task.append_log(f"[{_now_iso()}] pipeline 任务完成: {task.job_name}")
        self._finalize(
            task,
            status="completed",
            finished_at=_now_iso(),
            phase="done",
            result=result,
        )

    def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:
        """Run build then eval, updating ``task.phase`` as we go.

        Returns:
            A ``PipelineResult`` with the artifact paths (POSIX strings) and
            the question / parse-failure counts.

        Raises:
            ValueError: ``req.docs_path`` does not resolve to a directory.
            RuntimeError: The build produced no questions.
        """

        # ── resolve paths ──────────────────────────────────────────────
        docs_path = Path(req.docs_path)
        if not docs_path.is_absolute():
            # Relative paths are interpreted relative to the repository root.
            docs_path = (_REPO_ROOT / docs_path).resolve()
        if not docs_path.is_dir():
            raise ValueError(f"docs_path is not an existing directory: {docs_path}")

        # Every job writes below its own <output root>/<job_id>/ directory.
        job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id
        build_artifact_dir = job_output_dir / "build"
        dataset_csv = job_output_dir / "generated_dataset.csv"
        eval_output_dir = job_output_dir / "eval"

        # ── phase 1 + 2: dataset build (parse & generate) ─────────────
        # Parsing and question generation happen inside one build call, so
        # the phase reported here covers both.
        task.phase = "parsing_documents"
        task.append_log(f"  [build] 扫描文档目录: {docs_path}")
        build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)

        # Presumably where the build step writes its chunk dump — TODO confirm
        # against the dataset builder's artifact layout.
        source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"
        total_q = len(build_result.draft_samples)
        parse_failures = len(build_result.parse_failures)
        task.append_log(f"  [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")

        # Nothing to evaluate: fail the job instead of running an empty eval.
        if total_q == 0:
            raise RuntimeError("题库为空（所有文档均解析或生成失败），中止评估。")

        # ── phase 3: evaluation ────────────────────────────────────────
        task.phase = "evaluating"
        task.append_log(f"  [eval] 开始 RAGAS 评估，共 {total_q} 道题目")
        eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)

        # Imported lazily, like the other rag_eval imports in this class.
        from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths
        eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)

        return PipelineResult(
            build_artifact_dir=build_artifact_dir.as_posix(),
            dataset_csv=dataset_csv.as_posix(),
            source_chunks_jsonl=source_chunks_jsonl.as_posix(),
            total_questions=total_q,
            parse_failures=parse_failures,
            eval_run_id=eval_result.run_id,
            eval_output_dir=eval_result.scenario.output_dir.as_posix(),
            scores_csv=eval_artifact_paths.scores_csv.as_posix(),
            summary_md=eval_artifact_paths.summary_md.as_posix(),
        )

    def _run_build(self, task: PipelineTask, req: PipelineJobRequest,
                   docs_path: Path, artifact_dir: Path, dataset_csv: Path):
        """Construct a ``DatasetBuildJob`` from the request and run the build.

        Args:
            task: Job record; only its ``job_name`` is used here.
            req: API request supplying the model and limit settings.
            docs_path: Directory scanned for ``*.pdf`` inputs.
            artifact_dir: Directory passed to the build as its artifact dir.
            dataset_csv: Path passed to the build as the dataset path.

        Returns:
            Whatever ``execute_dataset_build_job`` returns; the caller reads
            its ``draft_samples`` and ``parse_failures`` attributes.
        """
        from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
        from rag_eval.dataset_builder.runner import execute_dataset_build_job
        from rag_eval.settings import EvaluationSettings

        settings = EvaluationSettings()
        job = DatasetBuildJob(
            job_name=task.job_name,
            input_path=docs_path,
            input_glob="*.pdf",
            parser_provider="aliyun_docmind",
            failure_mode=req.failure_mode,  # type: ignore[arg-type]
            generation_model=req.generation_model,
            output_type="online_question_bank",
            review_mode="draft_with_manual_review",
            max_questions_per_document=req.max_questions_per_document,
            max_source_chunks_per_question=req.max_source_chunks_per_question,
            dataset_path=dataset_csv,
            artifact_dir=artifact_dir,
            runtime=DatasetBuildRuntime(max_documents=req.max_documents),
        )
        return execute_dataset_build_job(job, settings=settings)

    def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,
                  dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):
        """Construct an online ``Scenario`` from the request and run the evaluation.

        Args:
            task: Job record; only its ``job_name`` is used (as scenario name).
            req: API request supplying models, metrics and sample limit.
            dataset_csv: Dataset file given to the scenario.
            source_chunks_jsonl: Chunk file handed to the app adapter.
            eval_output_dir: Output directory configured on the scenario.

        Returns:
            Whatever ``run_scenario_from_scenario_obj`` returns; the caller
            reads its ``scenario.output_dir`` and ``run_id`` attributes.
        """
        from rag_eval.execution.runner import run_scenario_from_scenario_obj
        from rag_eval.settings import EvaluationSettings
        from rag_eval.shared.models import (
            AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario,
        )

        settings = EvaluationSettings()
        scenario = Scenario(
            scenario_name=task.job_name,
            mode="online",
            dataset=DatasetConfig(path=dataset_csv),
            judge_model=req.judge_model,
            embedding_model=req.embedding_model,
            metrics=list(req.metrics),
            output_dir=eval_output_dir,
            # Batch size and concurrency are fixed here; only max_samples
            # comes from the request.
            runtime=RuntimeConfig(
                batch_size=4,
                app_concurrency=2,
                metric_concurrency=2,
                max_samples=req.max_samples,
            ),
            app_adapter=AppAdapterConfig(
                type="python",
                callable="apps.siemens_pdf_qa.adapter:run",
                static_kwargs={
                    "source_chunks_path": source_chunks_jsonl,
                    "model": req.answer_model,
                },
            ),
            optimization_advisor=req.optimization_advisor,
        )
        return run_scenario_from_scenario_obj(scenario, settings=settings)


# Module-level singleton shared by the FastAPI routes.
# It is created at import time with the default pool size (max_workers=2), so
# every importer of this module works against the same job registry.
pipeline_task_manager = PipelineTaskManager()
feat(session-async): add /api/score/session_async with incremental session report aggregation
- New POST /api/score/session_async endpoint: calls with the same session_id append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation
- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00
			`"""Background task manager for end-to-end pipeline jobs (build + eval).`

			`Each job runs three sequential phases inside a worker thread:`
			`1. parsing_documents — AliyunDocmind parses every PDF`
			`2. generating_questions — LLM generates a draft question bank`
			`3. evaluating — RAGAS online evaluation scores each question`

			`The DatasetBuildJob and Scenario objects are constructed entirely from the`
			`API request parameters, so no YAML config files are needed.`
			`"""`

			`from __future__ import annotations`

			`import io`
			`import threading`
			`import uuid`
			`from concurrent.futures import ThreadPoolExecutor`
			`from contextlib import redirect_stderr, redirect_stdout`
			`from datetime import datetime, timezone`
			`from pathlib import Path`

			`from webapp.models import (`
			`PipelineJobRequest,`
			`PipelineJobStatus,`
			`PipelineResult,`
			`)`

			`_REPO_ROOT = Path(__file__).resolve().parents[2]`
			`_PIPELINE_OUTPUT_ROOT = _REPO_ROOT / "outputs" / "pipeline"`


			`def _now_iso() -> str:`
			`return datetime.now(timezone.utc).isoformat()`


			`class _LineCapture(io.TextIOBase):`
			`"""Write-only stream that appends complete lines to a task's log buffer."""`

			`def __init__(self, sink: "PipelineTask") -> None:`
			`self._sink = sink`
			`self._buffer = ""`

			`def write(self, text: str) -> int:`
			`self._buffer += text`
			`while "\n" in self._buffer:`
			`line, self._buffer = self._buffer.split("\n", 1)`
			`self._sink.append_log(line)`
			`return len(text)`

			`def flush(self) -> None:`
			`if self._buffer:`
			`self._sink.append_log(self._buffer)`
			`self._buffer = ""`


			`class PipelineTask:`
			`"""Mutable state for one pipeline job (build + eval)."""`

			`def __init__(self, job_id: str, job_name: str) -> None:`
			`self.job_id = job_id`
			`self.job_name = job_name`
			`self.status = "queued"`
			`self.phase = "idle"`
			`self.logs: list[str] = []`
			`self.result: PipelineResult \| None = None`
			`self.error: str \| None = None`
			`self.created_at = _now_iso()`
			`self.finished_at = ""`
			`self._lock = threading.Lock()`

			`def append_log(self, line: str) -> None:`
			`with self._lock:`
			`self.logs.append(line)`

			`def snapshot(self) -> PipelineJobStatus:`
			`with self._lock:`
			`return PipelineJobStatus(`
			`job_id=self.job_id,`
			`job_name=self.job_name,`
			`status=self.status,`
			`phase=self.phase,`
			`logs=list(self.logs),`
			`result=self.result,`
			`error=self.error,`
			`created_at=self.created_at,`
			`finished_at=self.finished_at,`
			`)`


			`class PipelineTaskManager:`
			`"""Owns the thread pool and registry of pipeline jobs."""`

			`def __init__(self, max_workers: int = 2) -> None:`
			`self._executor = ThreadPoolExecutor(max_workers=max_workers)`
			`self._tasks: dict[str, PipelineTask] = {}`
			`self._lock = threading.Lock()`

			`def submit(self, request: PipelineJobRequest) -> PipelineTask:`
			`"""Register and schedule a new pipeline job; return its task object."""`
			`job_id = uuid.uuid4().hex[:12]`
			`job_name = request.job_name.strip() or f"pipeline-{job_id[:6]}"`
			`task = PipelineTask(job_id=job_id, job_name=job_name)`
			`with self._lock:`
			`self._tasks[job_id] = task`
			`self._executor.submit(self._run, task, request)`
			`return task`

			`def get(self, job_id: str) -> PipelineJobStatus \| None:`
			`with self._lock:`
			`task = self._tasks.get(job_id)`
			`return task.snapshot() if task is not None else None`

			`def list_jobs(self) -> list[PipelineJobStatus]:`
			`with self._lock:`
			`tasks = list(self._tasks.values())`
			`snapshots = [t.snapshot() for t in tasks]`
			`snapshots.sort(key=lambda s: s.created_at, reverse=True)`
			`return snapshots`

			`# ------------------------------------------------------------------ #`
			`# Worker`
			`# ------------------------------------------------------------------ #`

			`def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:`
			`"""Execute the full pipeline end to end inside a worker thread."""`
			`task.status = "running"`
			`task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")`

			`capture = _LineCapture(task)`
			`try:`
			`with redirect_stdout(capture), redirect_stderr(capture):`
			`result = self._execute(task, request)`
			`capture.flush()`
			`task.result = result`
			`task.phase = "done"`
			`task.status = "completed"`
			`task.append_log(f"[{_now_iso()}] pipeline 任务完成: {task.job_name}")`
			`except Exception as exc: # noqa: BLE001`
			`capture.flush()`
			`task.error = f"{type(exc).__name__}: {exc}"`
			`task.append_log(f"[{_now_iso()}] pipeline 任务失败: {task.error}")`
			`task.status = "failed"`
			`finally:`
			`task.finished_at = _now_iso()`

			`def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:`
			`"""Run build then eval, updating task.phase as we go."""`

			`# ── resolve paths ──────────────────────────────────────────────`
			`docs_path = Path(req.docs_path)`
			`if not docs_path.is_absolute():`
			`docs_path = (_REPO_ROOT / docs_path).resolve()`
			`if not docs_path.is_dir():`
			`raise ValueError(f"docs_path is not an existing directory: {docs_path}")`

			`job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id`
			`build_artifact_dir = job_output_dir / "build"`
			`dataset_csv = job_output_dir / "generated_dataset.csv"`
			`eval_output_dir = job_output_dir / "eval"`

			`# ── phase 1 + 2: dataset build (parse & generate) ─────────────`
			`task.phase = "parsing_documents"`
			`task.append_log(f" [build] 扫描文档目录: {docs_path}")`
			`build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)`

			`source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"`
			`total_q = len(build_result.draft_samples)`
			`parse_failures = len(build_result.parse_failures)`
			`task.append_log(f" [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")`

			`if total_q == 0:`
			`raise RuntimeError("题库为空（所有文档均解析或生成失败），中止评估。")`

			`# ── phase 3: evaluation ────────────────────────────────────────`
			`task.phase = "evaluating"`
			`task.append_log(f" [eval] 开始 RAGAS 评估，共 {total_q} 道题目")`
			`eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)`

			`from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths`
			`eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)`

			`return PipelineResult(`
			`build_artifact_dir=build_artifact_dir.as_posix(),`
			`dataset_csv=dataset_csv.as_posix(),`
			`source_chunks_jsonl=source_chunks_jsonl.as_posix(),`
			`total_questions=total_q,`
			`parse_failures=parse_failures,`
			`eval_run_id=eval_result.run_id,`
			`eval_output_dir=eval_result.scenario.output_dir.as_posix(),`
			`scores_csv=eval_artifact_paths.scores_csv.as_posix(),`
			`summary_md=eval_artifact_paths.summary_md.as_posix(),`
			`)`

			`def _run_build(self, task: PipelineTask, req: PipelineJobRequest,`
			`docs_path: Path, artifact_dir: Path, dataset_csv: Path):`
			`"""Construct DatasetBuildJob and run the build phase."""`
			`from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime`
			`from rag_eval.dataset_builder.runner import execute_dataset_build_job`
			`from rag_eval.settings import EvaluationSettings`

			`settings = EvaluationSettings()`
			`job = DatasetBuildJob(`
			`job_name=task.job_name,`
			`input_path=docs_path,`
			`input_glob="*.pdf",`
			`parser_provider="aliyun_docmind",`
			`failure_mode=req.failure_mode, # type: ignore[arg-type]`
			`generation_model=req.generation_model,`
			`output_type="online_question_bank",`
			`review_mode="draft_with_manual_review",`
			`max_questions_per_document=req.max_questions_per_document,`
			`max_source_chunks_per_question=req.max_source_chunks_per_question,`
			`dataset_path=dataset_csv,`
			`artifact_dir=artifact_dir,`
			`runtime=DatasetBuildRuntime(max_documents=req.max_documents),`
			`)`
			`return execute_dataset_build_job(job, settings=settings)`

			`def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,`
			`dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):`
			`"""Construct Scenario and run the evaluation phase."""`
			`from rag_eval.execution.runner import run_scenario_from_scenario_obj`
			`from rag_eval.settings import EvaluationSettings`
			`from rag_eval.shared.models import (`
			`AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario,`
			`)`

			`settings = EvaluationSettings()`
			`scenario = Scenario(`
			`scenario_name=task.job_name,`
			`mode="online",`
			`dataset=DatasetConfig(path=dataset_csv),`
			`judge_model=req.judge_model,`
			`embedding_model=req.embedding_model,`
			`metrics=list(req.metrics),`
			`output_dir=eval_output_dir,`
			`runtime=RuntimeConfig(`
			`batch_size=4,`
			`app_concurrency=2,`
			`metric_concurrency=2,`
			`max_samples=req.max_samples,`
			`),`
			`app_adapter=AppAdapterConfig(`
			`type="python",`
			`callable="apps.siemens_pdf_qa.adapter:run",`
			`static_kwargs={`
			`"source_chunks_path": source_chunks_jsonl,`
			`"model": req.answer_model,`
			`},`
			`),`
			`optimization_advisor=req.optimization_advisor,`
			`)`
			`return run_scenario_from_scenario_obj(scenario, settings=settings)`


			`# Module-level singleton shared by the FastAPI routes.`
			`pipeline_task_manager = PipelineTaskManager()`