feat: async score jobs — POST /api/score/async + 评分记录 page

Each async score job:
- Runs InlineScorer.score() in thread pool
- Writes standard run artifacts (metadata.json, scores.csv, summary.md)
- Runs optimization_advisor => optimization_advice.md
- Result appears in 运行列表 and 报告详情 with full report

New endpoints:
- POST /api/score/async  (202, job_id immediate)
- GET  /api/score/jobs   (list all jobs)
- GET  /api/score/jobs/{id} (single job status)

Frontend:
- 评分记录 nav page with card list
- 5s auto-polling for queued/running jobs
- 查看报告 button navigates to existing 报告详情 page

Dify: change /api/score -> /api/score/async, no response parsing needed

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-06-24 17:24:22 +08:00
parent abcd61ec8f
commit 4fd515d2d9
9 changed files with 706 additions and 11 deletions

View File

@@ -0,0 +1,269 @@
"""Background task manager for async RAGAS single-sample scoring.
Each job:
1. Runs InlineScorer.score() in a thread pool.
2. Constructs a minimal EvaluationResult + Scenario in the standard format.
3. Calls write_run_artifacts() — produces metadata.json, scores.csv, summary.md.
4. Calls run_advisor() — produces optimization_advice.md.
The resulting run directory lands under outputs/score-async/<run_id>/ and is
automatically picked up by run_reader.list_run_summaries(), so it appears in
the existing 「运行列表」 and 「报告详情」 pages without any extra wiring.
"""
from __future__ import annotations
import json
import math
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from webapp.models import AsyncScoreJobStatus, ScoreRequest
# Repository root: two directory levels above the directory holding this file.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Default parent directory for the run artifacts produced by async score jobs.
_DEFAULT_JOBS_DIR = _REPO_ROOT.joinpath("outputs", "score-async")
# Default directory for the lightweight per-job index JSON files.
_DEFAULT_INDEX_DIR = _REPO_ROOT.joinpath("outputs", "score-jobs")
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
class ScoreJobManager:
    """Thread-pool manager for async scoring jobs.

    Every submitted job is tracked in an in-memory cache and mirrored to a
    per-job JSON file under ``index_dir``. Scoring results are written as
    standard run artifacts under ``output_dir`` so the report detail page
    can render them with zero additional code.
    """

    # Logger name used by the worker and the index loader.
    _LOGGER_NAME = "webapp.services.score_job_manager"

    def __init__(
        self,
        output_dir: Path = _DEFAULT_JOBS_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the directories, the worker pool and load persisted jobs.

        Args:
            output_dir: Parent directory for the run artifacts of each job.
            index_dir: Directory holding one ``<job_id>.json`` status file
                per job.
            max_workers: Maximum number of scoring jobs executed in parallel.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._cache: dict[str, AsyncScoreJobStatus] = {}
        self._lock = threading.Lock()
        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #
    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
        """Queue one scoring job and return its initial status immediately.

        Args:
            request: The scoring request; only a truncated summary of it is
                stored in the job status.

        Returns:
            The freshly created status with ``status="queued"``.
        """
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                # Question/answer are cut to 80 characters for the summary.
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
            },
        )
        with self._lock:
            # Persist before caching: if the index write fails, the error
            # propagates and no "queued" ghost job (one that is never handed
            # to the executor) is left behind in the cache.
            self._persist_index(status)
            self._cache[job_id] = status
        self._executor.submit(self._run, job_id, request)
        return status

    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return the current status of ``job_id``, or None if unknown."""
        with self._lock:
            return self._cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all known jobs, newest first (sorted by ``created_at``)."""
        with self._lock:
            jobs = list(self._cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #
    @staticmethod
    def _finite_or_none(value: Any) -> Any:
        """Map NaN / +-inf to None; return every other value unchanged.

        ``json.dumps`` writes non-finite floats as non-standard ``NaN`` /
        ``Infinity`` tokens, and encoders running with ``allow_nan=False``
        raise on them, so such scores are treated as "no score".
        """
        try:
            return value if math.isfinite(value) else None
        except (TypeError, ValueError, OverflowError):
            # Not a real number (e.g. None or a string): leave untouched.
            return value

    def _run(self, job_id: str, request: ScoreRequest) -> None:
        """Execute scoring, write run artifacts, run advisor.

        Runs inside the thread pool. Every failure, including a failing lazy
        import or settings construction, is recorded on the job as
        ``status="failed"``; the advisor step alone is best-effort and only
        logs a warning when it fails.
        """
        import logging

        logger = logging.getLogger(self._LOGGER_NAME)
        self._update(job_id, status="running")
        # Fallback start time so the failure path can always report a latency,
        # even when the setup below raises before the real clock is started.
        t0 = time.monotonic()
        try:
            # Lazy imports to keep web server bootable if ragas is not
            # installed. They live inside the try block so that an import or
            # settings error marks the job as failed; an exception escaping
            # this function would only be stored on the executor's Future and
            # the job would stay "running" forever.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.metrics.weights import compute_weighted_score
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig,
                EvaluationResult,
                NormalizedSample,
                RuntimeConfig,
                Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            # Metrics that were requested but are not in the effective set.
            skipped = sorted(requested - set(effective))

            # Restart the clock so latency covers scoring only, not setup.
            t0 = time.monotonic()
            started_at = utc_now_iso()
            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}
            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Non-finite scores become None so they never reach the JSON
            # index, the API payload or the weighted score.
            clean_scores = {
                name: self._finite_or_none(value)
                for name, value in raw_scores.items()
            }
            # Build full scores dict (skipped = None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(clean_scores)
            weighted_raw = self._finite_or_none(
                compute_weighted_score(
                    {k: v for k, v in clean_scores.items() if v is not None}, {}
                )
            )
            weighted = round(weighted_raw, 4) if weighted_raw is not None else None

            # Build a score row compatible with report_builder
            score_row: dict[str, Any] = {
                "sample_id": "async-score-1",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            score_row.update(all_scores)

            # Construct minimal EvaluationResult so write_run_artifacts works.
            # NOTE(review): run_id is derived from the finish timestamp only;
            # if utc_now_iso() has coarse resolution, two jobs finishing at
            # the same instant would share a run directory -- confirm.
            run_id = finished_at.replace(":", "-")
            output_dir = self._output_dir

            # Build a minimal Scenario for snapshot + advisor
            scenario = Scenario(
                scenario_name=f"async-score-{job_id}",
                mode="offline",
                dataset=DatasetConfig(path=output_dir / run_id / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=list(request.metrics),
                output_dir=output_dir,
                optimization_advisor=True,  # always generate advice
            )
            sample = NormalizedSample(
                sample_id="async-score-1",
                question=request.question,
                answer=request.answer or "",
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth or "",
            )
            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at,
                finished_at=finished_at,
                valid_samples=[sample],
                invalid_samples=[],
                score_rows=[score_row],
            )
            write_run_artifacts(result)
            logger.info("[score_job] artifacts written job_id=%s run_id=%s", job_id, run_id)

            # Run optimization advisor (builds optimization_advice.md).
            # Deliberately best-effort: a failing advisor must not fail a job
            # whose scores and artifacts were already produced.
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[score_job] advisor done job_id=%s", job_id)
            except Exception as adv_exc:  # noqa: BLE001
                logger.warning("[score_job] advisor failed job_id=%s err=%s", job_id, adv_exc)

            self._update(
                job_id,
                status="completed",
                finished_at=finished_at,
                run_id=run_id,
                scores=all_scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )
        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            # logger.exception keeps the traceback, which the job's short
            # ``error`` string does not carry.
            logger.exception("[score_job] failed job_id=%s err=%s", job_id, exc)
            self._update(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #
    def _update(self, job_id: str, **kwargs: Any) -> None:
        """Merge kwargs into the job status and persist the index.

        Unknown job ids are ignored. The cached status object is replaced by
        an updated copy rather than mutated in place.
        """
        with self._lock:
            existing = self._cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._cache[job_id] = updated
            self._persist_index(updated)

    def _persist_index(self, status: AsyncScoreJobStatus) -> None:
        """Write a lightweight index JSON for this job (survives restarts).

        The payload is written to a sibling ``*.tmp`` file first and then
        renamed over the target, so a crash mid-write cannot leave a
        truncated ``<job_id>.json`` behind. The temporary name does not match
        the ``*.json`` glob used when loading the index.
        """
        path = self._index_dir / f"{status.job_id}.json"
        payload = json.dumps(status.model_dump(), ensure_ascii=False, indent=2)
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)

    def _load_existing(self) -> None:
        """Load existing job index files on startup.

        Unreadable or invalid files are skipped with a warning so that one
        bad file cannot prevent the manager from starting.
        """
        import logging

        logger = logging.getLogger(self._LOGGER_NAME)
        # NOTE(review): jobs persisted as "queued"/"running" are loaded as-is
        # and are not re-submitted here, so after a restart they presumably
        # stay in that state -- confirm whether they should be marked failed.
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[score_job] skipping unreadable index file path=%s err=%s",
                    path,
                    exc,
                )
                continue
            self._cache[status.job_id] = status
# Module-level singleton shared by FastAPI routes.
# NOTE: it is constructed at import time, so importing this module creates the
# default output/index directories, builds the thread pool executor and loads
# any previously persisted job index files.
score_job_manager = ScoreJobManager()