feat: async score jobs — POST /api/score/async + 评分记录 page
Each async score job:
- Runs InlineScorer.score() in thread pool
- Writes standard run artifacts (metadata.json, scores.csv, summary.md)
- Runs optimization_advisor => optimization_advice.md
- Result appears in 运行列表 and 报告详情 with full report
New endpoints:
- POST /api/score/async (202, job_id immediate)
- GET /api/score/jobs (list all jobs)
- GET /api/score/jobs/{id} (single job status)
Frontend:
- 评分记录 nav page with card list
- 5s auto-polling for queued/running jobs
- 查看报告 button navigates to existing 报告详情 page
Dify: change /api/score -> /api/score/async, no response parsing needed
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
269
webapp/services/score_job_manager.py
Normal file
269
webapp/services/score_job_manager.py
Normal file
@@ -0,0 +1,269 @@
|
||||
"""Background task manager for async RAGAS single-sample scoring.
|
||||
|
||||
Each job:
|
||||
1. Runs InlineScorer.score() in a thread pool.
|
||||
2. Constructs a minimal EvaluationResult + Scenario in the standard format.
|
||||
3. Calls write_run_artifacts() — produces metadata.json, scores.csv, summary.md.
|
||||
4. Calls run_advisor() — produces optimization_advice.md.
|
||||
|
||||
The resulting run directory lands under outputs/score-async/<run_id>/ and is
|
||||
automatically picked up by run_reader.list_run_summaries(), so it appears in
|
||||
the existing 「运行列表」 and 「报告详情」 pages without any extra wiring.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import math
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from webapp.models import AsyncScoreJobStatus, ScoreRequest
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
_DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-async"
|
||||
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-jobs" # lightweight job index
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
class ScoreJobManager:
    """Thread-pool manager for async scoring jobs.

    Results are written as standard run artifacts so the report detail page
    can render them with zero additional code.

    Job status objects are kept in an in-memory cache guarded by a lock and
    mirrored to one JSON file per job under ``index_dir`` so that the job list
    can be reloaded when a new manager is constructed.
    """

    def __init__(
        self,
        output_dir: Path = _DEFAULT_JOBS_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the directories, the worker pool, and load persisted jobs.

        Args:
            output_dir: Directory passed to the scenario as the run output root.
            index_dir: Directory holding one ``<job_id>.json`` status file per job.
            max_workers: Maximum number of scoring jobs executed concurrently.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._cache: dict[str, AsyncScoreJobStatus] = {}
        self._lock = threading.Lock()
        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
        """Queue one scoring job and return its initial status immediately.

        Args:
            request: The scoring request; only a truncated summary of it is
                stored in the job status.

        Returns:
            The freshly created status object in the ``"queued"`` state.
        """
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                # Question/answer are truncated to keep the index file small.
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
            },
        )
        with self._lock:
            self._cache[job_id] = status
            self._persist_index(status)
        self._executor.submit(self._run, job_id, request)
        return status

    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status or None if unknown."""
        with self._lock:
            return self._cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all known jobs, newest first."""
        with self._lock:
            jobs = list(self._cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    @staticmethod
    def _is_usable_score(value: Any) -> bool:
        """Return True if *value* may take part in the weighted score.

        ``None`` and non-finite numbers (NaN / inf) are rejected; values that
        cannot be interpreted as a number are passed through unchanged so the
        downstream function decides what to do with them.
        """
        if value is None:
            return False
        try:
            return math.isfinite(value)
        except (TypeError, ValueError):
            return True

    def _run(self, job_id: str, request: ScoreRequest) -> None:
        """Execute scoring, write run artifacts, run advisor.

        Runs on a worker thread. Any exception raised during setup, scoring or
        artifact writing marks the job as ``"failed"``; an advisor failure is
        only logged and does not fail the job.

        Args:
            job_id: Identifier of the job previously registered by ``submit``.
            request: The scoring request to evaluate.
        """
        import logging

        logger = logging.getLogger("webapp.services.score_job_manager")
        self._update(job_id, status="running")

        # Fallback start time so the failure path can always report a latency,
        # even when setup (imports / settings) fails before scoring starts.
        t0 = time.monotonic()

        try:
            # Lazy imports to keep web server bootable if ragas is not installed.
            # They live inside the try block so that a missing dependency marks
            # the job as failed instead of leaving it stuck in "running".
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.metrics.weights import compute_weighted_score
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig,
                EvaluationResult,
                NormalizedSample,
                Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            skipped = sorted(requested - set(effective))

            # Latency measures the scoring call only, not the setup above.
            t0 = time.monotonic()
            started_at = utc_now_iso()

            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}

            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build full scores dict (skipped = None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)

            # Exclude missing and non-finite values so a single NaN metric
            # cannot turn the weighted score into NaN.
            # NOTE(review): assumes the scorer may report NaN for a metric it
            # could not evaluate — confirm against inline_scorer.
            weighted_raw = compute_weighted_score(
                {k: v for k, v in raw_scores.items() if self._is_usable_score(v)},
                {},
            )
            weighted = round(weighted_raw, 4) if weighted_raw is not None else None

            # Build a score row compatible with report_builder
            score_row: dict[str, Any] = {
                "sample_id": "async-score-1",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            score_row.update(all_scores)

            # Construct minimal EvaluationResult so write_run_artifacts works.
            # Colons are replaced so the timestamp can be used as a directory name.
            run_id = finished_at.replace(":", "-")
            output_dir = self._output_dir

            # Build a minimal Scenario for snapshot + advisor
            scenario = Scenario(
                scenario_name=f"async-score-{job_id}",
                mode="offline",
                dataset=DatasetConfig(path=output_dir / run_id / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=list(request.metrics),
                output_dir=output_dir,
                optimization_advisor=True,  # always generate advice
            )

            sample = NormalizedSample(
                sample_id="async-score-1",
                question=request.question,
                answer=request.answer or "",
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth or "",
            )

            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at,
                finished_at=finished_at,
                valid_samples=[sample],
                invalid_samples=[],
                score_rows=[score_row],
            )

            write_run_artifacts(result)
            logger.info("[score_job] artifacts written job_id=%s run_id=%s", job_id, run_id)

            # Run optimization advisor (builds optimization_advice.md).
            # Best-effort: the scores are already written, so an advisor
            # failure must not fail the whole job.
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[score_job] advisor done job_id=%s", job_id)
            except Exception as adv_exc:  # noqa: BLE001
                logger.warning("[score_job] advisor failed job_id=%s err=%s", job_id, adv_exc)

            self._update(
                job_id,
                status="completed",
                finished_at=finished_at,
                run_id=run_id,
                scores=all_scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )

        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception("[score_job] failed job_id=%s err=%s", job_id, exc)
            self._update(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _update(self, job_id: str, **kwargs: Any) -> None:
        """Merge kwargs into the job status and persist the index.

        Unknown job ids are ignored. The cache entry is replaced with a copy
        rather than mutated, so status objects already handed out by ``get``
        or ``list_jobs`` are not changed.
        """
        with self._lock:
            existing = self._cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._cache[job_id] = updated
            self._persist_index(updated)

    def _persist_index(self, status: AsyncScoreJobStatus) -> None:
        """Write a lightweight index JSON for this job (survives restarts).

        The payload is written to a sibling temporary file and then renamed
        over the target, so an interrupted write cannot leave a truncated
        ``<job_id>.json`` behind. Callers in this class hold ``self._lock``.
        """
        path = self._index_dir / f"{status.job_id}.json"
        payload = json.dumps(status.model_dump(), ensure_ascii=False, indent=2)
        # The ".tmp" suffix keeps the temp file out of the "*.json" glob used
        # by _load_existing.
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)

    def _load_existing(self) -> None:
        """Load existing job index files on startup.

        Unreadable or invalid files are skipped (best-effort) but reported
        with a warning instead of being silently ignored.
        """
        import logging

        logger = logging.getLogger("webapp.services.score_job_manager")
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[score_job] skipping unreadable index file path=%s err=%s",
                    path,
                    exc,
                )
                continue
            self._cache[status.job_id] = status
|
||||
|
||||
|
||||
# Module-level singleton shared by FastAPI routes.
|
||||
# NOTE: constructed at import time with the default directories; the
# constructor creates those directories and loads any persisted job index
# files, so importing this module touches the filesystem.
score_job_manager = ScoreJobManager()
|
||||
Reference in New Issue
Block a user