"""Background task manager for async RAGAS single-sample scoring.

Each job:
  1. Runs InlineScorer.score() in a thread pool.
  2. Constructs a minimal EvaluationResult + Scenario in the standard format.
  3. Calls write_run_artifacts() — produces metadata.json, scores.csv, summary.md.
  4. Calls run_advisor() — produces optimization_advice.md.

The resulting run directory lands under outputs/score-async/<run_id>/ and is
automatically picked up by run_reader.list_run_summaries(), so it appears in
the existing "Run List" and "Report Detail" pages without any extra wiring.
"""

from __future__ import annotations

import json
import logging
import math
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

from webapp.models import AsyncScoreJobStatus, ScoreRequest

logger = logging.getLogger("webapp.services.score_job_manager")

# Three directory levels above this file; assumed to be the repository root.
_REPO_ROOT = Path(__file__).resolve().parents[2]
_DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-async"
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-jobs"  # lightweight job index


def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


class ScoreJobManager:
    """Thread-pool manager for async scoring jobs.

    Results are written as standard run artifacts so the report detail page
    can render them with zero additional code.

    Job statuses are kept in an in-memory cache guarded by a lock and are
    mirrored to one JSON file per job under the index directory.
    """

    def __init__(
        self,
        output_dir: Path = _DEFAULT_JOBS_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create both directories, start the worker pool, and load the saved index.

        Args:
            output_dir: Directory passed to the scenario as its output directory.
            index_dir: Directory holding one ``<job_id>.json`` status file per job.
            max_workers: Size of the scoring thread pool.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._cache: dict[str, AsyncScoreJobStatus] = {}
        self._lock = threading.Lock()
        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
        """Queue one scoring job and return its initial status immediately.

        Args:
            request: The scoring request to run in the background.

        Returns:
            The freshly created status object in state ``"queued"``. Its
            ``request_summary`` keeps only the first 80 characters of the
            question and answer.
        """
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
            },
        )
        with self._lock:
            self._cache[job_id] = status
            self._persist_index(status)
        # The returned Future is intentionally discarded; ``_run`` reports its
        # outcome through the job status instead.
        self._executor.submit(self._run, job_id, request)
        return status

    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status or None if unknown."""
        with self._lock:
            return self._cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all known jobs, newest first (ordered by ``created_at``)."""
        with self._lock:
            jobs = list(self._cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    def _run(self, job_id: str, request: ScoreRequest) -> None:
        """Execute scoring, write run artifacts, run advisor.

        Runs on a worker thread. Any exception raised during setup, scoring
        or artifact writing is caught and recorded on the job as status
        ``"failed"``; an advisor failure is only logged and does not fail
        the job.

        Args:
            job_id: Identifier of the job to update as work progresses.
            request: The scoring request captured at submit time.
        """
        self._update(job_id, status="running")

        # Provisional start mark so the failure handler can always report a
        # latency, even when setup fails before scoring begins.
        t0 = time.monotonic()
        try:
            # Lazy imports to keep web server bootable if ragas is not
            # installed. They sit inside the ``try`` so that a missing
            # dependency marks the job "failed"; outside it, the exception
            # would be stored on the discarded Future and the job would stay
            # "running" forever.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.metrics.weights import compute_weighted_score
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig,
                EvaluationResult,
                NormalizedSample,
                Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model

            effective = request.effective_metrics()
            requested = set(request.metrics)
            skipped = sorted(requested - set(effective))

            # Restart the clock so latency_ms covers scoring only, not the
            # one-off import and settings cost.
            t0 = time.monotonic()
            started_at = utc_now_iso()

            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}

            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build full scores dict (skipped = None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)

            weighted_raw = compute_weighted_score(
                {k: v for k, v in raw_scores.items() if v is not None}, {}
            )
            weighted = round(weighted_raw, 4) if weighted_raw is not None else None

            # Build a score row compatible with report_builder
            score_row: dict[str, Any] = {
                "sample_id": "async-score-1",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            score_row.update(all_scores)

            # Construct minimal EvaluationResult so write_run_artifacts works.
            # The run id is the finish timestamp with ":" replaced by "-".
            run_id = finished_at.replace(":", "-")
            output_dir = self._output_dir

            # Build a minimal Scenario for snapshot + advisor
            scenario = Scenario(
                scenario_name=f"async-score-{job_id}",
                mode="offline",
                dataset=DatasetConfig(path=output_dir / run_id / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=list(request.metrics),
                output_dir=output_dir,
                optimization_advisor=True,  # always generate advice
            )

            sample = NormalizedSample(
                sample_id="async-score-1",
                question=request.question,
                answer=request.answer or "",
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth or "",
            )

            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at,
                finished_at=finished_at,
                valid_samples=[sample],
                invalid_samples=[],
                score_rows=[score_row],
            )

            write_run_artifacts(result)
            logger.info("[score_job] artifacts written job_id=%s run_id=%s", job_id, run_id)

            # Run optimization advisor (builds optimization_advice.md).
            # Best-effort: the scores are already written, so an advisor
            # failure is logged and the job still completes.
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[score_job] advisor done job_id=%s", job_id)
            except Exception as adv_exc:  # noqa: BLE001
                logger.warning("[score_job] advisor failed job_id=%s err=%s", job_id, adv_exc)

            self._update(
                job_id,
                status="completed",
                finished_at=finished_at,
                run_id=run_id,
                scores=all_scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )

        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            # ``exception`` logs at ERROR level and attaches the traceback.
            logger.exception("[score_job] failed job_id=%s err=%s", job_id, exc)
            self._update(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _update(self, job_id: str, **kwargs: Any) -> None:
        """Merge kwargs into the job status and persist the index.

        Unknown job ids are ignored. The cached status object is replaced by
        an updated copy rather than mutated in place.
        """
        with self._lock:
            existing = self._cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._cache[job_id] = updated
            self._persist_index(updated)

    def _persist_index(self, status: AsyncScoreJobStatus) -> None:
        """Write a lightweight index JSON for this job (survives restarts).

        The payload is written to a sibling ``.tmp`` file and then moved over
        the real path, so an interrupted write cannot leave a truncated
        ``<job_id>.json`` behind. Callers in this class hold ``self._lock``
        while calling, so the temp file is never written concurrently.
        """
        path = self._index_dir / f"{status.job_id}.json"
        payload = json.dumps(status.model_dump(), ensure_ascii=False, indent=2)
        # The ".json.tmp" suffix is not matched by the "*.json" glob used on load.
        tmp_path = path.with_name(f"{path.name}.tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)

    def _load_existing(self) -> None:
        """Load existing job index files on startup.

        Loading is best-effort: a file that cannot be read, parsed or
        validated is skipped with a warning instead of aborting startup.

        NOTE(review): statuses are loaded exactly as persisted, so a job saved
        as "queued" or "running" by a previous process stays in that state;
        nothing in this class resumes it or marks it terminal. Confirm whether
        that is intended.
        """
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[score_job] skipping unreadable index file path=%s err=%s", path, exc
                )
                continue
            self._cache[status.job_id] = status


# Module-level singleton shared by FastAPI routes.
score_job_manager = ScoreJobManager()