- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing
chore(weighted-score): comment out 综合加权得分 display and computation
- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
453 lines
18 KiB
Python
453 lines
18 KiB
Python
"""Background task manager for session-grouped async RAGAS scoring.
|
|
|
|
Each session groups multiple scoring calls into one shared run report:
|
|
|
|
1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
|
|
2. Every call: appends a new sample row to scores.csv, rewrites summary.md
|
|
and optimization_advice.md by re-running write_run_artifacts + run_advisor
|
|
over ALL accumulated rows.
|
|
3. The resulting run directory is picked up automatically by run_reader, so the
|
|
「运行列表」 and 「报告详情」 pages show the live, growing report.
|
|
|
|
Concurrency model:
|
|
- Scoring (LLM network I/O) runs freely in the thread pool — different sessions
|
|
score concurrently; multiple calls to the same session also start scoring in
|
|
parallel.
|
|
- File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
|
|
a per-session threading.Lock, so no two calls corrupt the same session's CSV.
|
|
"""
|
|
|
|
from __future__ import annotations

import json
import logging
import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import pandas as pd

from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
|
|
|
|
# parents[2] is the directory two levels above the directory holding this file.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Default location of the per-session run directories.
_DEFAULT_OUTPUT_DIR = _REPO_ROOT.joinpath("outputs", "score-session")
# Default location of the per-job and per-session JSON index files.
_DEFAULT_INDEX_DIR = _REPO_ROOT.joinpath("outputs", "score-session-jobs")

# Columns that are sample metadata rather than metric scores
# (mirrors run_reader.NON_METRIC_COLUMNS).
_NON_METRIC_COLUMNS = {
    # sample content
    "sample_id",
    "question",
    "contexts",
    "answer",
    "ground_truth",
    # run / scenario description
    "scenario",
    "language",
    "retrieval_config",
    "error",
    "judge_model",
    "embedding_model",
    "run_id",
    "difficulty",
    "question_type",
    # source-document provenance
    "doc_id",
    "doc_name",
    "section_path",
    "page_start",
    "page_end",
    "source_chunk_ids",
    # review workflow
    "review_status",
    "review_notes",
    # weighted-score bookkeeping
    "weighted_score",
    "sample_weight",
}
|
|
|
|
|
|
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
|
|
|
|
def _sanitize_session_id(session_id: str) -> str:
    """Turn an arbitrary session_id into a safe directory-name fragment.

    Every character outside ``[a-zA-Z0-9]`` becomes ``-``; the result is cut to
    64 characters and stripped of leading/trailing dashes.  If nothing is left,
    the literal ``"default"`` is returned.
    """
    dashed = re.sub(r"[^a-zA-Z0-9]", "-", session_id)
    fragment = dashed[:64].strip("-")
    if not fragment:
        return "default"
    return fragment
|
|
|
|
|
|
class SessionScoreJobManager:
    """Thread-pool manager for session-grouped async scoring jobs.

    All calls sharing a session_id append to one shared run directory, so the
    report detail page shows all samples and their aggregate metrics together.

    State kept in memory (and mirrored to JSON files under ``index_dir``):

    * ``_job_cache``     - job_id -> latest ``AsyncScoreJobStatus``
    * ``_session_jobs``  - session_id -> job_ids in submission order
    * ``_session_locks`` - run_id -> lock serializing file I/O on that run dir

    NOTE(review): jobs restored from disk in the "queued" or "running" state are
    kept as-is; nothing in this class resubmits them after a restart.
    """

    # Logger used for all session-job messages.
    _log = logging.getLogger("webapp.services.session_score_manager")

    def __init__(
        self,
        output_dir: Path = _DEFAULT_OUTPUT_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the directories, the worker pool, and reload persisted state.

        Args:
            output_dir: Parent directory of the per-session run directories.
            index_dir: Directory holding ``<job_id>.json`` files and the
                ``_sessions/`` sub-directory with session -> job_ids mappings.
            max_workers: Size of the scoring thread pool.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        (self._index_dir / "_sessions").mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)

        # job_id -> AsyncScoreJobStatus; guarded by _lock
        self._job_cache: dict[str, AsyncScoreJobStatus] = {}
        # session_id -> [job_ids in order]; guarded by _lock
        self._session_jobs: dict[str, list[str]] = {}
        # run_id -> lock for that run directory; guarded by _lock
        self._session_locks: dict[str, threading.Lock] = {}
        self._lock = threading.Lock()

        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def session_run_id(self, session_id: str) -> str:
        """Return the deterministic run_id for a session (also the dir name)."""
        return f"session-{_sanitize_session_id(session_id)}"

    def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
        """Queue one scoring call for a session.

        The job is registered and persisted as "queued" before it is handed to
        the thread pool.

        Returns:
            ``(job_status, run_id)``; run_id is deterministic from session_id.
        """
        run_id = self.session_run_id(session_id)
        job_id = uuid.uuid4().hex[:12]

        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                # Long texts are truncated; the summary is for listing only.
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
                "session_id": session_id,
            },
        )

        with self._lock:
            self._job_cache[job_id] = status
            self._session_jobs.setdefault(session_id, []).append(job_id)

        self._persist_job_index(status)
        self._persist_session_index(session_id)
        self._executor.submit(self._run, job_id, session_id, run_id, request)
        return status, run_id

    def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status of one call, or None if unknown."""
        with self._lock:
            return self._job_cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all session job records, newest first."""
        with self._lock:
            jobs = list(self._job_cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    def get_session(self, session_id: str) -> SessionStatus | None:
        """Return aggregate status for a session, or None if unknown.

        ``call_count`` is the number of submitted jobs (finished or not);
        ``metric_means`` is read from the session's scores.csv on disk.
        """
        with self._lock:
            job_ids = list(self._session_jobs.get(session_id) or [])
        if not job_ids:
            return None

        run_id = self.session_run_id(session_id)
        run_dir = self._output_dir / run_id

        # Compute live metric means from the CSV (may be mid-update - best effort).
        metric_means = self._read_metric_means(run_dir)

        with self._lock:
            jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]

        latest = max((j.finished_at for j in jobs if j.finished_at), default="")
        return SessionStatus(
            session_id=session_id,
            run_id=run_id,
            call_count=len(job_ids),
            metric_means=metric_means,
            latest_finished_at=latest,
            jobs=sorted(jobs, key=lambda j: j.created_at),
        )

    def list_sessions(self) -> list[SessionStatus]:
        """Return aggregate status for all known sessions, latest-finished first."""
        with self._lock:
            session_ids = list(self._session_jobs)
        statuses = (self.get_session(sid) for sid in session_ids)
        results = [status for status in statuses if status is not None]
        results.sort(key=lambda s: s.latest_finished_at, reverse=True)
        return results

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
        """Score one sample then append it to the session's shared run artifacts.

        Runs on a pool thread. Any exception, including a failing lazy import
        or settings construction, marks the job "failed" instead of escaping
        into the Future, where it would leave the job at "running".
        """
        log = self._log
        t0 = time.monotonic()

        try:
            self._update_job(job_id, status="running")

            # Lazy imports - keep web server bootable if ragas is not installed.
            # They live inside the try so an ImportError is reported on the job.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig,
                EvaluationResult,
                NormalizedSample,
                Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            skipped = sorted(set(request.metrics) - set(effective))

            # Restart the clock so latency covers scoring only, not setup.
            t0 = time.monotonic()

            # --- Scoring (can run concurrently for the same session) ----------
            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}

            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build complete scores for this sample (skipped metrics -> None).
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)

            # Composite weighted-score computation is temporarily disabled.
            # To re-enable, import rag_eval.metrics.weights.compute_weighted_score
            # and restore:
            #   weighted_raw = compute_weighted_score(
            #       {k: v for k, v in raw_scores.items() if v is not None}, {}
            #   )
            #   weighted = round(weighted_raw, 4) if weighted_raw is not None else None
            weighted = None

            # --- File I/O must be serialized per session ----------------------
            with self._get_session_lock(session_id):
                run_dir = self._output_dir / run_id
                run_dir.mkdir(parents=True, exist_ok=True)

                # Read all existing rows, then append the new one.
                existing_rows = self._read_score_rows(run_dir)
                call_number = len(existing_rows) + 1

                new_row: dict[str, Any] = {
                    "sample_id": f"session-score-{call_number}",
                    "question": request.question,
                    "answer": request.answer or "",
                    # NOTE(review): stored as given; the reconstruction below
                    # assumes a " |||| "-joined string - confirm against
                    # ScoreRequest.contexts.
                    "contexts": request.contexts or "",
                    "ground_truth": request.ground_truth or "",
                    "error": "",
                }
                new_row.update(all_scores)

                all_rows = existing_rows + [new_row]

                # Reconstruct NormalizedSample objects for write_run_artifacts
                # metadata. Cells that were empty in the CSV come back as None
                # and must become "" rather than the text "None"/"nan".
                as_text = self._as_text
                valid_samples = [
                    NormalizedSample(
                        sample_id=as_text(row.get("sample_id")) or f"session-score-{idx + 1}",
                        question=as_text(row.get("question")),
                        answer=as_text(row.get("answer")),
                        contexts=[
                            part.strip()
                            for part in as_text(row.get("contexts")).split(" |||| ")
                            if part.strip()
                        ],
                        ground_truth=as_text(row.get("ground_truth")),
                    )
                    for idx, row in enumerate(all_rows)
                ]

                # Determine all metric columns (union of all rows' metric keys).
                all_metric_names = sorted({
                    key
                    for row in all_rows
                    for key in row
                    if key not in _NON_METRIC_COLUMNS
                })

                scenario = Scenario(
                    scenario_name=f"session-{_sanitize_session_id(session_id)}",
                    mode="offline",
                    dataset=DatasetConfig(path=run_dir / "dataset.csv"),
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    metrics=all_metric_names,
                    output_dir=self._output_dir,
                    optimization_advisor=True,
                )

                # NOTE(review): nothing in this class writes a "_started_at"
                # column, so this normally falls back to finished_at.
                started_at_val = (
                    existing_rows[0].get("_started_at", finished_at)
                    if existing_rows
                    else finished_at
                )

                result = EvaluationResult(
                    scenario=scenario,
                    run_id=run_id,
                    started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
                    finished_at=finished_at,
                    valid_samples=valid_samples,
                    invalid_samples=[],
                    score_rows=all_rows,
                )

                write_run_artifacts(result)
                log.info(
                    "[session_job] artifacts written job_id=%s session_id=%s call=%d",
                    job_id, session_id, call_number,
                )

                # Regenerate optimization advice over all accumulated rows.
                # Advisor failure is deliberately non-fatal: scores are saved.
                try:
                    llm, _ = build_models(judge_model, embedding_model, settings)
                    run_advisor(result, scenario, llm)
                    log.info("[session_job] advisor done job_id=%s session=%s", job_id, session_id)
                except Exception as adv_exc:  # noqa: BLE001
                    log.warning(
                        "[session_job] advisor failed job_id=%s err=%s", job_id, adv_exc
                    )

                self._update_job(
                    job_id,
                    status="completed",
                    finished_at=finished_at,
                    run_id=run_id,
                    scores=all_scores,
                    weighted_score=weighted,
                    latency_ms=latency_ms,
                    skipped_metrics=skipped,
                )
                self._persist_session_index(session_id)

        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            # logger.exception keeps the traceback for post-mortem debugging.
            log.exception("[session_job] failed job_id=%s err=%s", job_id, exc)
            self._update_job(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return ``str(value)``, mapping missing values (None / NaN) to ``""``."""
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only x != x
            return ""
        return str(value)

    def _get_session_lock(self, session_id: str) -> threading.Lock:
        """Return the lock guarding file I/O for this session's run directory.

        The lock is keyed by run_id rather than the raw session_id: distinct
        session_ids that sanitize to the same run_id write to the same
        directory and therefore must share one lock.
        """
        key = self.session_run_id(session_id)
        with self._lock:
            lock = self._session_locks.get(key)
            if lock is None:
                lock = self._session_locks[key] = threading.Lock()
            return lock

    def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
        """Read existing scores.csv rows, returning empty list if file doesn't exist.

        Missing cells are returned as ``None``. An unreadable or unparsable
        file also yields an empty list.
        """
        scores_path = run_dir / "scores.csv"
        if not scores_path.is_file():
            return []
        try:
            frame = pd.read_csv(scores_path)
            # Cast to object first: on numeric columns where(..., None) would
            # otherwise coerce None straight back to NaN.
            cleaned = frame.astype(object).where(frame.notna(), None)
            return cleaned.to_dict("records")
        except (OSError, ValueError):
            return []

    def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
        """Compute per-metric means from the session's scores.csv.

        Only numeric columns not listed in ``_NON_METRIC_COLUMNS`` are included.
        A column with no usable values maps to ``None``; a missing or
        unreadable file yields ``{}``.
        """
        scores_path = run_dir / "scores.csv"
        if not scores_path.is_file():
            return {}
        try:
            frame = pd.read_csv(scores_path)
        except (OSError, ValueError):
            return {}
        means: dict[str, float | None] = {}
        for col in frame.columns:
            if col in _NON_METRIC_COLUMNS:
                continue
            if pd.api.types.is_numeric_dtype(frame[col]):
                val = frame[col].mean(numeric_only=True)
                means[col] = None if pd.isna(val) else round(float(val), 4)
        return means

    def _update_job(self, job_id: str, **kwargs: Any) -> None:
        """Apply field updates to a cached job and persist the new record.

        Unknown job_ids are ignored.
        """
        with self._lock:
            existing = self._job_cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._job_cache[job_id] = updated
        # Disk write happens outside the lock so readers are not blocked on I/O.
        self._persist_job_index(updated)

    def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
        """Persist a single job's status to ``<index_dir>/<job_id>.json``."""
        path = self._index_dir / f"{status.job_id}.json"
        path.write_text(
            json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def _persist_session_index(self, session_id: str) -> None:
        """Persist the session -> job_ids mapping.

        The file name is the sanitized session_id, so session_ids that sanitize
        identically overwrite the same file.
        """
        with self._lock:
            job_ids = list(self._session_jobs.get(session_id) or [])
        run_id = self.session_run_id(session_id)
        data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}
        path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
        path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def _load_existing(self) -> None:
        """Restore job cache and session mappings from persisted index files on startup.

        Loading is best-effort: an unreadable or invalid file is logged and
        skipped so one bad file cannot prevent the manager from starting.
        """
        # Load individual job files (glob is non-recursive, so _sessions/ is skipped).
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                self._log.warning(
                    "[session_job] skipping unreadable job index %s: %s", path, exc
                )
                continue
            self._job_cache[status.job_id] = status

        # Load session -> job_ids mappings.
        sessions_dir = self._index_dir / "_sessions"
        if not sessions_dir.is_dir():
            return
        for path in sorted(sessions_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                sid = data.get("session_id", "")
                job_ids = data.get("job_ids", [])
            except Exception as exc:  # noqa: BLE001
                self._log.warning(
                    "[session_job] skipping unreadable session index %s: %s", path, exc
                )
                continue
            if sid:
                # submit() appends to this value, so it must be a list.
                self._session_jobs[sid] = job_ids if isinstance(job_ids, list) else []
|
|
|
|
|
|
# Module-level singleton shared by FastAPI routes.
# Constructing it at import time creates the default output/index directories
# and reloads any persisted job and session index files.
session_score_manager = SessionScoreJobManager()
|