feat(session-async): add /api/score/session_async with incremental session report aggregation
- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing
chore(weighted-score): comment out 综合加权得分 display and computation
- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
452
webapp/services/session_score_manager.py
Normal file
452
webapp/services/session_score_manager.py
Normal file
@@ -0,0 +1,452 @@
|
||||
"""Background task manager for session-grouped async RAGAS scoring.
|
||||
|
||||
Each session groups multiple scoring calls into one shared run report:
|
||||
|
||||
1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
|
||||
2. Every call: appends a new sample row to scores.csv, rewrites summary.md
|
||||
and optimization_advice.md by re-running write_run_artifacts + run_advisor
|
||||
over ALL accumulated rows.
|
||||
3. The resulting run directory is picked up automatically by run_reader, so the
|
||||
「运行列表」 and 「报告详情」 pages show the live, growing report.
|
||||
|
||||
Concurrency model:
|
||||
- Scoring (LLM network I/O) runs freely in the thread pool — different sessions
|
||||
score concurrently; multiple calls to the same session also start scoring in
|
||||
parallel.
|
||||
- File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
|
||||
a per-session threading.Lock, so no two calls corrupt the same session's CSV.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
|
||||
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"
|
||||
|
||||
# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS)
|
||||
_NON_METRIC_COLUMNS = {
|
||||
"sample_id", "question", "contexts", "answer", "ground_truth",
|
||||
"scenario", "language", "retrieval_config", "error",
|
||||
"judge_model", "embedding_model", "run_id", "difficulty",
|
||||
"question_type", "doc_id", "doc_name", "section_path",
|
||||
"page_start", "page_end", "source_chunk_ids", "review_status",
|
||||
"review_notes", "weighted_score", "sample_weight",
|
||||
}
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _sanitize_session_id(session_id: str) -> str:
|
||||
"""Convert an arbitrary session_id string to a safe directory-name fragment."""
|
||||
return re.sub(r"[^a-zA-Z0-9]", "-", session_id)[:64].strip("-") or "default"
|
||||
|
||||
|
||||
class SessionScoreJobManager:
|
||||
"""Thread-pool manager for session-grouped async scoring jobs.
|
||||
|
||||
All calls sharing a session_id append to one shared run directory, so the
|
||||
report detail page shows all samples and their aggregate metrics together.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
output_dir: Path = _DEFAULT_OUTPUT_DIR,
|
||||
index_dir: Path = _DEFAULT_INDEX_DIR,
|
||||
max_workers: int = 4,
|
||||
) -> None:
|
||||
self._output_dir = Path(output_dir)
|
||||
self._index_dir = Path(index_dir)
|
||||
self._output_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._index_dir.mkdir(parents=True, exist_ok=True)
|
||||
(self._index_dir / "_sessions").mkdir(parents=True, exist_ok=True)
|
||||
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||
|
||||
# job_id -> AsyncScoreJobStatus; guarded by _lock
|
||||
self._job_cache: dict[str, AsyncScoreJobStatus] = {}
|
||||
# session_id -> [job_ids in order]; guarded by _lock
|
||||
self._session_jobs: dict[str, list[str]] = {}
|
||||
# session_id -> per-session threading.Lock; guarded by _lock
|
||||
self._session_locks: dict[str, threading.Lock] = {}
|
||||
self._lock = threading.Lock()
|
||||
|
||||
self._load_existing()
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Public API
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
def session_run_id(self, session_id: str) -> str:
|
||||
"""Return the deterministic run_id for a session (also the dir name)."""
|
||||
return f"session-{_sanitize_session_id(session_id)}"
|
||||
|
||||
def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
|
||||
"""Queue one scoring call for a session.
|
||||
|
||||
Returns (job_status, run_id). run_id is deterministic from session_id.
|
||||
"""
|
||||
run_id = self.session_run_id(session_id)
|
||||
job_id = uuid.uuid4().hex[:12]
|
||||
|
||||
status = AsyncScoreJobStatus(
|
||||
job_id=job_id,
|
||||
status="queued",
|
||||
created_at=_now_iso(),
|
||||
request_summary={
|
||||
"question": (request.question or "")[:80],
|
||||
"answer": (request.answer or "")[:80],
|
||||
"metrics": list(request.metrics),
|
||||
"judge_model": request.judge_model or "",
|
||||
"embedding_model": request.embedding_model or "",
|
||||
"has_contexts": bool(request.contexts),
|
||||
"has_ground_truth": bool(request.ground_truth),
|
||||
"session_id": session_id,
|
||||
},
|
||||
)
|
||||
|
||||
with self._lock:
|
||||
self._job_cache[job_id] = status
|
||||
if session_id not in self._session_jobs:
|
||||
self._session_jobs[session_id] = []
|
||||
self._session_jobs[session_id].append(job_id)
|
||||
|
||||
self._persist_job_index(status)
|
||||
self._persist_session_index(session_id)
|
||||
self._executor.submit(self._run, job_id, session_id, run_id, request)
|
||||
return status, run_id
|
||||
|
||||
def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
|
||||
"""Return current status of one call, or None if unknown."""
|
||||
with self._lock:
|
||||
return self._job_cache.get(job_id)
|
||||
|
||||
def list_jobs(self) -> list[AsyncScoreJobStatus]:
|
||||
"""Return all session job records, newest first."""
|
||||
with self._lock:
|
||||
jobs = list(self._job_cache.values())
|
||||
jobs.sort(key=lambda j: j.created_at, reverse=True)
|
||||
return jobs
|
||||
|
||||
def get_session(self, session_id: str) -> SessionStatus | None:
|
||||
"""Return aggregate status for a session, or None if unknown."""
|
||||
with self._lock:
|
||||
job_ids = list(self._session_jobs.get(session_id) or [])
|
||||
if not job_ids:
|
||||
return None
|
||||
|
||||
run_id = self.session_run_id(session_id)
|
||||
run_dir = self._output_dir / run_id
|
||||
|
||||
# Compute live metric means from the CSV (may be mid-update — best effort)
|
||||
metric_means = self._read_metric_means(run_dir)
|
||||
|
||||
with self._lock:
|
||||
jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]
|
||||
|
||||
latest = max((j.finished_at for j in jobs if j.finished_at), default="")
|
||||
return SessionStatus(
|
||||
session_id=session_id,
|
||||
run_id=run_id,
|
||||
call_count=len(job_ids),
|
||||
metric_means=metric_means,
|
||||
latest_finished_at=latest,
|
||||
jobs=sorted(jobs, key=lambda j: j.created_at),
|
||||
)
|
||||
|
||||
def list_sessions(self) -> list[SessionStatus]:
|
||||
"""Return aggregate status for all known sessions."""
|
||||
with self._lock:
|
||||
session_ids = list(self._session_jobs.keys())
|
||||
results = []
|
||||
for sid in session_ids:
|
||||
status = self.get_session(sid)
|
||||
if status is not None:
|
||||
results.append(status)
|
||||
results.sort(key=lambda s: s.latest_finished_at, reverse=True)
|
||||
return results
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Worker
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
|
||||
"""Score one sample then append it to the session's shared run artifacts."""
|
||||
import logging
|
||||
logger = logging.getLogger("webapp.services.session_score_manager")
|
||||
self._update_job(job_id, status="running")
|
||||
|
||||
# Lazy imports — keep web server bootable if ragas is not installed.
|
||||
from rag_eval.advisor import run_advisor
|
||||
from rag_eval.metrics.factory import build_models
|
||||
from rag_eval.metrics.weights import compute_weighted_score
|
||||
from rag_eval.reporting.writers import write_run_artifacts
|
||||
from rag_eval.settings import EvaluationSettings
|
||||
from rag_eval.shared.models import (
|
||||
DatasetConfig, EvaluationResult, NormalizedSample,
|
||||
RuntimeConfig, Scenario,
|
||||
)
|
||||
from rag_eval.shared.utils import utc_now_iso
|
||||
from webapp.services.inline_scorer import inline_scorer
|
||||
|
||||
settings = EvaluationSettings()
|
||||
judge_model = request.judge_model or settings.ragas_judge_model
|
||||
embedding_model = request.embedding_model or settings.ragas_embedding_model
|
||||
effective = request.effective_metrics()
|
||||
requested = set(request.metrics)
|
||||
skipped = sorted(requested - set(effective))
|
||||
|
||||
t0 = time.monotonic()
|
||||
|
||||
try:
|
||||
# --- Scoring (can run concurrently for the same session) ----------
|
||||
if effective:
|
||||
raw_scores = inline_scorer.score(
|
||||
question=request.question,
|
||||
answer=request.answer,
|
||||
contexts=request.contexts_as_list(),
|
||||
ground_truth=request.ground_truth,
|
||||
metrics=effective,
|
||||
judge_model=judge_model,
|
||||
embedding_model=embedding_model,
|
||||
settings=settings,
|
||||
)
|
||||
else:
|
||||
raw_scores = {}
|
||||
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
finished_at = utc_now_iso()
|
||||
|
||||
# Build complete scores for this sample (skipped metrics → None)
|
||||
all_scores: dict[str, float | None] = {m: None for m in request.metrics}
|
||||
all_scores.update(raw_scores)
|
||||
|
||||
# 综合加权得分计算(已暂时禁用)
|
||||
# weighted_raw = compute_weighted_score(
|
||||
# {k: v for k, v in raw_scores.items() if v is not None}, {}
|
||||
# )
|
||||
# weighted = round(weighted_raw, 4) if weighted_raw is not None else None
|
||||
weighted = None
|
||||
|
||||
# --- File I/O must be serialized per session ----------------------
|
||||
session_lock = self._get_session_lock(session_id)
|
||||
with session_lock:
|
||||
run_dir = self._output_dir / run_id
|
||||
run_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read all existing rows, then append the new one
|
||||
existing_rows = self._read_score_rows(run_dir)
|
||||
call_number = len(existing_rows) + 1
|
||||
|
||||
new_row: dict[str, Any] = {
|
||||
"sample_id": f"session-score-{call_number}",
|
||||
"question": request.question,
|
||||
"answer": request.answer or "",
|
||||
"contexts": request.contexts or "",
|
||||
"ground_truth": request.ground_truth or "",
|
||||
"error": "",
|
||||
}
|
||||
new_row.update(all_scores)
|
||||
|
||||
all_rows = existing_rows + [new_row]
|
||||
|
||||
# Reconstruct NormalizedSample objects for write_run_artifacts metadata
|
||||
valid_samples = [
|
||||
NormalizedSample(
|
||||
sample_id=str(row.get("sample_id", f"session-score-{i + 1}")),
|
||||
question=str(row.get("question", "")),
|
||||
answer=str(row.get("answer", "")),
|
||||
contexts=[
|
||||
part.strip()
|
||||
for part in str(row.get("contexts", "")).split(" |||| ")
|
||||
if part.strip()
|
||||
],
|
||||
ground_truth=str(row.get("ground_truth", "")),
|
||||
)
|
||||
for i, row in enumerate(all_rows)
|
||||
]
|
||||
|
||||
# Determine all metric columns (union of all rows' metric keys)
|
||||
all_metric_names = sorted({
|
||||
k for row in all_rows
|
||||
for k in row if k not in _NON_METRIC_COLUMNS
|
||||
})
|
||||
|
||||
scenario = Scenario(
|
||||
scenario_name=f"session-{_sanitize_session_id(session_id)}",
|
||||
mode="offline",
|
||||
dataset=DatasetConfig(path=run_dir / "dataset.csv"),
|
||||
judge_model=judge_model,
|
||||
embedding_model=embedding_model,
|
||||
metrics=all_metric_names,
|
||||
output_dir=self._output_dir,
|
||||
optimization_advisor=True,
|
||||
)
|
||||
|
||||
started_at_val = (
|
||||
existing_rows[0].get("_started_at", finished_at)
|
||||
if existing_rows else finished_at
|
||||
)
|
||||
|
||||
result = EvaluationResult(
|
||||
scenario=scenario,
|
||||
run_id=run_id,
|
||||
started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
|
||||
finished_at=finished_at,
|
||||
valid_samples=valid_samples,
|
||||
invalid_samples=[],
|
||||
score_rows=all_rows,
|
||||
)
|
||||
|
||||
write_run_artifacts(result)
|
||||
logger.info(
|
||||
"[session_job] artifacts written job_id=%s session_id=%s call=%d",
|
||||
job_id, session_id, call_number,
|
||||
)
|
||||
|
||||
# Regenerate optimization advice over all accumulated rows
|
||||
try:
|
||||
llm, _ = build_models(judge_model, embedding_model, settings)
|
||||
run_advisor(result, scenario, llm)
|
||||
logger.info("[session_job] advisor done job_id=%s session=%s", job_id, session_id)
|
||||
except Exception as adv_exc: # noqa: BLE001
|
||||
logger.warning(
|
||||
"[session_job] advisor failed job_id=%s err=%s", job_id, adv_exc
|
||||
)
|
||||
|
||||
self._update_job(
|
||||
job_id,
|
||||
status="completed",
|
||||
finished_at=finished_at,
|
||||
run_id=run_id,
|
||||
scores=all_scores,
|
||||
weighted_score=weighted,
|
||||
latency_ms=latency_ms,
|
||||
skipped_metrics=skipped,
|
||||
)
|
||||
self._persist_session_index(session_id)
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
import logging as _logging
|
||||
_logging.getLogger("webapp.services.session_score_manager").error(
|
||||
"[session_job] failed job_id=%s err=%s", job_id, exc
|
||||
)
|
||||
self._update_job(
|
||||
job_id,
|
||||
status="failed",
|
||||
finished_at=_now_iso(),
|
||||
latency_ms=latency_ms,
|
||||
error=f"{type(exc).__name__}: {exc}",
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Helpers
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
def _get_session_lock(self, session_id: str) -> threading.Lock:
|
||||
with self._lock:
|
||||
if session_id not in self._session_locks:
|
||||
self._session_locks[session_id] = threading.Lock()
|
||||
return self._session_locks[session_id]
|
||||
|
||||
def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
|
||||
"""Read existing scores.csv rows, returning empty list if file doesn't exist."""
|
||||
scores_path = run_dir / "scores.csv"
|
||||
if not scores_path.is_file():
|
||||
return []
|
||||
try:
|
||||
frame = pd.read_csv(scores_path)
|
||||
return frame.where(pd.notnull(frame), None).to_dict("records")
|
||||
except (OSError, ValueError):
|
||||
return []
|
||||
|
||||
def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
|
||||
"""Compute per-metric means from the session's scores.csv."""
|
||||
scores_path = run_dir / "scores.csv"
|
||||
if not scores_path.is_file():
|
||||
return {}
|
||||
try:
|
||||
frame = pd.read_csv(scores_path)
|
||||
except (OSError, ValueError):
|
||||
return {}
|
||||
means: dict[str, float | None] = {}
|
||||
for col in frame.columns:
|
||||
if col in _NON_METRIC_COLUMNS:
|
||||
continue
|
||||
if pd.api.types.is_numeric_dtype(frame[col]):
|
||||
val = frame[col].mean(numeric_only=True)
|
||||
means[col] = None if pd.isna(val) else round(float(val), 4)
|
||||
return means
|
||||
|
||||
def _update_job(self, job_id: str, **kwargs: Any) -> None:
|
||||
with self._lock:
|
||||
existing = self._job_cache.get(job_id)
|
||||
if existing is None:
|
||||
return
|
||||
updated = existing.model_copy(update=kwargs)
|
||||
self._job_cache[job_id] = updated
|
||||
self._persist_job_index(updated)
|
||||
|
||||
def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
|
||||
"""Persist a single job's status to the index directory."""
|
||||
path = self._index_dir / f"{status.job_id}.json"
|
||||
path.write_text(
|
||||
json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
def _persist_session_index(self, session_id: str) -> None:
|
||||
"""Persist the session→job_ids mapping."""
|
||||
with self._lock:
|
||||
job_ids = list(self._session_jobs.get(session_id) or [])
|
||||
run_id = self.session_run_id(session_id)
|
||||
data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}
|
||||
path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
|
||||
path.write_text(
|
||||
json.dumps(data, ensure_ascii=False, indent=2),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
def _load_existing(self) -> None:
|
||||
"""Restore job cache and session mappings from persisted index files on startup."""
|
||||
# Load individual job files
|
||||
for path in sorted(self._index_dir.glob("*.json")):
|
||||
try:
|
||||
data = json.loads(path.read_text(encoding="utf-8"))
|
||||
status = AsyncScoreJobStatus.model_validate(data)
|
||||
self._job_cache[status.job_id] = status
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
# Load session→job_ids mappings
|
||||
sessions_dir = self._index_dir / "_sessions"
|
||||
if not sessions_dir.is_dir():
|
||||
return
|
||||
for path in sorted(sessions_dir.glob("*.json")):
|
||||
try:
|
||||
data = json.loads(path.read_text(encoding="utf-8"))
|
||||
sid = data.get("session_id", "")
|
||||
job_ids = data.get("job_ids", [])
|
||||
if sid:
|
||||
self._session_jobs[sid] = job_ids
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
|
||||
# Module-level singleton shared by FastAPI routes.
|
||||
session_score_manager = SessionScoreJobManager()
|
||||
Reference in New Issue
Block a user