feat: async score jobs — POST /api/score/async + 评分记录 page

Each async score job:
- Runs InlineScorer.score() in a thread pool
- Writes standard run artifacts (metadata.json, scores.csv, summary.md)
- Runs optimization_advisor => optimization_advice.md
- Result appears in 运行列表 and 报告详情 with the full report

New endpoints:
- POST /api/score/async (returns 202 with the job_id immediately)
- GET /api/score/jobs (list all jobs)
- GET /api/score/jobs/{id} (single job status)

Frontend:
- 评分记录 nav page with a card list
- 5s auto-polling for queued/running jobs
- 查看报告 button navigates to the existing 报告详情 page

Dify: change /api/score -> /api/score/async; no response parsing is needed.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-24 17:24:22 +08:00
parent abcd61ec8f
commit 4fd515d2d9
9 changed files with 706 additions and 11 deletions
--- a/webapp/services/score_job_manager.py
+++ b/webapp/services/score_job_manager.py
@@ -0,0 +1,269 @@
+"""Background task manager for async RAGAS single-sample scoring.
+
+Each job:
+  1. Runs InlineScorer.score() in a thread pool.
+  2. Constructs a minimal EvaluationResult + Scenario in the standard format.
+  3. Calls write_run_artifacts() — produces metadata.json, scores.csv, summary.md.
+  4. Calls run_advisor() — produces optimization_advice.md.
+
+The resulting run directory lands under outputs/score-async/<run_id>/ and is
+automatically picked up by run_reader.list_run_summaries(), so it appears in
+the existing 「运行列表」 and 「报告详情」 pages without any extra wiring.
+"""
+
+from __future__ import annotations
+
+import json
+import math
+import threading
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from webapp.models import AsyncScoreJobStatus, ScoreRequest
+
+# This module lives at webapp/services/score_job_manager.py, so parents[2]
+# of the resolved file path is the repository root.
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+# Directory passed to each job's Scenario as its output directory.
+_DEFAULT_JOBS_DIR = _REPO_ROOT.joinpath("outputs", "score-async")
+# Lightweight job index: one <job_id>.json status file per job.
+_DEFAULT_INDEX_DIR = _REPO_ROOT.joinpath("outputs", "score-jobs")
+
+
+def _now_iso() -> str:
+    """Return the current UTC time as a timezone-aware ISO 8601 string."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat()
+
+
+class ScoreJobManager:
+    """Thread-pool manager for async scoring jobs.
+
+    Job statuses are kept in an in-memory cache guarded by a lock and are
+    mirrored to one JSON file per job under ``index_dir`` so the job list can
+    be reloaded on startup.
+
+    Results are written as standard run artifacts so the report detail page
+    can render them with zero additional code.
+    """
+
+    def __init__(
+        self,
+        output_dir: Path = _DEFAULT_JOBS_DIR,
+        index_dir: Path = _DEFAULT_INDEX_DIR,
+        max_workers: int = 4,
+    ) -> None:
+        """Create both directories, start the worker pool and reload saved jobs.
+
+        Args:
+            output_dir: Directory passed to each job's scenario as its output
+                directory.
+            index_dir: Directory holding one ``<job_id>.json`` status file per
+                job.
+            max_workers: Maximum number of scoring jobs executed concurrently.
+        """
+        self._output_dir = Path(output_dir)
+        self._index_dir = Path(index_dir)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+        self._index_dir.mkdir(parents=True, exist_ok=True)
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._cache: dict[str, AsyncScoreJobStatus] = {}
+        self._lock = threading.Lock()
+        self._load_existing()
+
+    # ------------------------------------------------------------------ #
+    # Public API
+    # ------------------------------------------------------------------ #
+
+    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
+        """Queue one scoring job and return its initial status immediately.
+
+        Args:
+            request: The scoring request; only a truncated summary of it is
+                stored in the job status.
+
+        Returns:
+            The freshly created status, in state ``queued``.
+        """
+        job_id = uuid.uuid4().hex[:12]
+        status = AsyncScoreJobStatus(
+            job_id=job_id,
+            status="queued",
+            created_at=_now_iso(),
+            request_summary={
+                # Question and answer are truncated to 80 characters.
+                "question": (request.question or "")[:80],
+                "answer": (request.answer or "")[:80],
+                "metrics": list(request.metrics),
+                "judge_model": request.judge_model or "",
+                "embedding_model": request.embedding_model or "",
+                "has_contexts": bool(request.contexts),
+                "has_ground_truth": bool(request.ground_truth),
+            },
+        )
+        with self._lock:
+            self._cache[job_id] = status
+        # Persist the queued state before handing the job to the pool so the
+        # worker's later updates are always written after this one.
+        self._persist_index(status)
+        self._executor.submit(self._run, job_id, request)
+        return status
+
+    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
+        """Return current status or None if unknown."""
+        with self._lock:
+            return self._cache.get(job_id)
+
+    def list_jobs(self) -> list[AsyncScoreJobStatus]:
+        """Return all known jobs, newest first (ordered by ``created_at``)."""
+        with self._lock:
+            jobs = list(self._cache.values())
+        jobs.sort(key=lambda j: j.created_at, reverse=True)
+        return jobs
+
+    # ------------------------------------------------------------------ #
+    # Worker
+    # ------------------------------------------------------------------ #
+
+    @staticmethod
+    def _finite_or_none(value: Any) -> Any:
+        """Return ``None`` for NaN/inf floats and *value* unchanged otherwise.
+
+        Non-finite floats are not valid JSON numbers, so they are normalised
+        to ``None`` — the value already used for metrics without a score.
+        """
+        if isinstance(value, float) and not math.isfinite(value):
+            return None
+        return value
+
+    def _run(self, job_id: str, request: ScoreRequest) -> None:
+        """Execute scoring, write run artifacts, run advisor.
+
+        Runs on a pool thread. Any exception — including a failure of the
+        lazy imports or of settings construction — marks the job ``failed``
+        instead of leaving it stuck in ``running``. An advisor failure is only
+        logged; the job still completes.
+
+        Args:
+            job_id: Identifier of a job previously registered by ``submit``.
+            request: The scoring request to evaluate.
+        """
+        import logging
+
+        logger = logging.getLogger("webapp.services.score_job_manager")
+        self._update(job_id, status="running")
+
+        # Fallback start time so the failure handler can always report a
+        # latency; it is reset right before scoring starts.
+        t0 = time.monotonic()
+
+        try:
+            # Lazy imports to keep web server bootable if ragas is not installed.
+            # They live inside the try block so an ImportError fails the job
+            # visibly instead of dying silently in the worker thread.
+            from rag_eval.advisor import run_advisor
+            from rag_eval.metrics.factory import build_models
+            from rag_eval.metrics.weights import compute_weighted_score
+            from rag_eval.reporting.writers import write_run_artifacts
+            from rag_eval.settings import EvaluationSettings
+            from rag_eval.shared.models import (
+                DatasetConfig,
+                EvaluationResult,
+                NormalizedSample,
+                Scenario,
+            )
+            from rag_eval.shared.utils import utc_now_iso
+            from webapp.services.inline_scorer import inline_scorer
+
+            settings = EvaluationSettings()
+            judge_model = request.judge_model or settings.ragas_judge_model
+            embedding_model = request.embedding_model or settings.ragas_embedding_model
+            effective = request.effective_metrics()
+            requested = set(request.metrics)
+            # Metrics that were requested but are not in the effective set.
+            skipped = sorted(requested - set(effective))
+
+            t0 = time.monotonic()
+            started_at = utc_now_iso()
+
+            if effective:
+                raw_scores = inline_scorer.score(
+                    question=request.question,
+                    answer=request.answer,
+                    contexts=request.contexts_as_list(),
+                    ground_truth=request.ground_truth,
+                    metrics=effective,
+                    judge_model=judge_model,
+                    embedding_model=embedding_model,
+                    settings=settings,
+                )
+            else:
+                raw_scores = {}
+
+            latency_ms = int((time.monotonic() - t0) * 1000)
+            finished_at = utc_now_iso()
+
+            # Normalise NaN/inf to None so the status stays JSON-serialisable.
+            raw_scores = {
+                name: self._finite_or_none(value)
+                for name, value in raw_scores.items()
+            }
+
+            # Build full scores dict (skipped = None)
+            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
+            all_scores.update(raw_scores)
+            weighted_raw = self._finite_or_none(
+                compute_weighted_score(
+                    {k: v for k, v in raw_scores.items() if v is not None}, {}
+                )
+            )
+            weighted = round(weighted_raw, 4) if weighted_raw is not None else None
+
+            # Build a score row compatible with report_builder
+            score_row: dict[str, Any] = {
+                "sample_id": "async-score-1",
+                "question": request.question,
+                "answer": request.answer or "",
+                "contexts": request.contexts or "",
+                "ground_truth": request.ground_truth or "",
+                "error": "",
+            }
+            score_row.update(all_scores)
+
+            # Construct minimal EvaluationResult so write_run_artifacts works.
+            # NOTE(review): run_id is derived from the finish timestamp only,
+            # so two jobs finishing at the same timestamp would share a run
+            # id — confirm the resolution of utc_now_iso() and what format
+            # run_reader expects before making it unique per job.
+            run_id = finished_at.replace(":", "-")
+            output_dir = self._output_dir
+
+            # Build a minimal Scenario for snapshot + advisor
+            scenario = Scenario(
+                scenario_name=f"async-score-{job_id}",
+                mode="offline",
+                dataset=DatasetConfig(path=output_dir / run_id / "dataset.csv"),
+                judge_model=judge_model,
+                embedding_model=embedding_model,
+                metrics=list(request.metrics),
+                output_dir=output_dir,
+                optimization_advisor=True,  # always generate advice
+            )
+
+            sample = NormalizedSample(
+                sample_id="async-score-1",
+                question=request.question,
+                answer=request.answer or "",
+                contexts=request.contexts_as_list(),
+                ground_truth=request.ground_truth or "",
+            )
+
+            result = EvaluationResult(
+                scenario=scenario,
+                run_id=run_id,
+                started_at=started_at,
+                finished_at=finished_at,
+                valid_samples=[sample],
+                invalid_samples=[],
+                score_rows=[score_row],
+            )
+
+            write_run_artifacts(result)
+            logger.info("[score_job] artifacts written  job_id=%s  run_id=%s", job_id, run_id)
+
+            # Run optimization advisor (builds optimization_advice.md).
+            # Best effort: a failure here is logged and does not fail the job.
+            try:
+                llm, _ = build_models(judge_model, embedding_model, settings)
+                run_advisor(result, scenario, llm)
+                logger.info("[score_job] advisor done  job_id=%s", job_id)
+            except Exception as adv_exc:  # noqa: BLE001
+                logger.warning("[score_job] advisor failed  job_id=%s  err=%s", job_id, adv_exc)
+
+            self._update(
+                job_id,
+                status="completed",
+                finished_at=finished_at,
+                run_id=run_id,
+                scores=all_scores,
+                weighted_score=weighted,
+                latency_ms=latency_ms,
+                skipped_metrics=skipped,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            latency_ms = int((time.monotonic() - t0) * 1000)
+            # logger.exception keeps the traceback alongside the message.
+            logger.exception("[score_job] failed  job_id=%s  err=%s", job_id, exc)
+            self._update(
+                job_id,
+                status="failed",
+                finished_at=_now_iso(),
+                latency_ms=latency_ms,
+                error=f"{type(exc).__name__}: {exc}",
+            )
+
+    # ------------------------------------------------------------------ #
+    # Persistence helpers
+    # ------------------------------------------------------------------ #
+
+    def _update(self, job_id: str, **kwargs: Any) -> None:
+        """Merge kwargs into the job status and persist the index.
+
+        Unknown job ids are ignored. The cache is swapped under the lock; the
+        file write happens after the lock is released.
+        """
+        with self._lock:
+            existing = self._cache.get(job_id)
+            if existing is None:
+                return
+            updated = existing.model_copy(update=kwargs)
+            self._cache[job_id] = updated
+        self._persist_index(updated)
+
+    def _persist_index(self, status: AsyncScoreJobStatus) -> None:
+        """Write a lightweight index JSON for this job (survives restarts).
+
+        The payload is written to a sibling ``.tmp`` file and then renamed
+        over the target, so an interrupted write cannot leave a truncated
+        ``<job_id>.json`` behind. The ``.tmp`` name does not match the
+        ``*.json`` pattern used when reloading.
+        """
+        path = self._index_dir / f"{status.job_id}.json"
+        tmp_path = path.with_name(f"{path.name}.tmp")
+        tmp_path.write_text(
+            json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        tmp_path.replace(path)
+
+    def _load_existing(self) -> None:
+        """Load existing job index files on startup.
+
+        Unreadable or invalid files are logged and skipped. Jobs that were
+        persisted as ``queued`` or ``running`` cannot be resumed (only a
+        summary of the request is stored), so they are marked ``failed``;
+        otherwise they would be reported as in progress forever.
+        """
+        import logging
+
+        logger = logging.getLogger("webapp.services.score_job_manager")
+        for path in sorted(self._index_dir.glob("*.json")):
+            try:
+                data = json.loads(path.read_text(encoding="utf-8"))
+                status = AsyncScoreJobStatus.model_validate(data)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning(
+                    "[score_job] skipping unreadable index file  path=%s  err=%s",
+                    path,
+                    exc,
+                )
+                continue
+
+            if status.status in ("queued", "running"):
+                # NOTE(review): assumes a single server process owns this
+                # index directory — confirm before running several workers.
+                status = status.model_copy(
+                    update={
+                        "status": "failed",
+                        "finished_at": _now_iso(),
+                        "error": "Interrupted: server restarted before the job finished",
+                    }
+                )
+                try:
+                    self._persist_index(status)
+                except OSError as exc:
+                    logger.warning(
+                        "[score_job] could not persist interrupted job  job_id=%s  err=%s",
+                        status.job_id,
+                        exc,
+                    )
+
+            self._cache[status.job_id] = status
+
+
+# Module-level singleton shared by FastAPI routes.
+# NOTE: it is constructed at import time with the default directories, so
+# importing this module runs ScoreJobManager.__init__, which creates the
+# output/index directories, starts the thread pool and loads saved job files.
+score_job_manager = ScoreJobManager()