Files
siemens_ragas/webapp/services/session_score_manager.py

453 lines
18 KiB
Python
Raw Permalink Normal View History

"""Background task manager for session-grouped async RAGAS scoring.
Each session groups multiple scoring calls into one shared run report:
1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
2. Every call: appends a new sample row to scores.csv, rewrites summary.md
and optimization_advice.md by re-running write_run_artifacts + run_advisor
over ALL accumulated rows.
3. The resulting run directory is picked up automatically by run_reader, so the
run list (运行列表) and report detail (报告详情) pages show the live, growing report.
Concurrency model:
- Scoring (LLM network I/O) runs freely in the thread pool: different sessions
score concurrently; multiple calls to the same session also start scoring in
parallel.
- File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
a per-session threading.Lock, so no two calls corrupt the same session's CSV.
"""
from __future__ import annotations
import json
import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import pandas as pd
from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
# Third ancestor directory of this file; used below as the repository root
# under which the default output locations live.
_REPO_ROOT = Path(__file__).resolve().parents[2]
# Default parent directory of the per-session run directories.
_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
# Default directory for the persisted job / session index JSON files.
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"
# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS).
# Any other column found in a score row is treated as a metric by this module.
_NON_METRIC_COLUMNS = {
"sample_id", "question", "contexts", "answer", "ground_truth",
"scenario", "language", "retrieval_config", "error",
"judge_model", "embedding_model", "run_id", "difficulty",
"question_type", "doc_id", "doc_name", "section_path",
"page_start", "page_end", "source_chunk_ids", "review_status",
"review_notes", "weighted_score", "sample_weight",
}
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def _sanitize_session_id(session_id: str) -> str:
    """Turn an arbitrary session id into a safe directory-name fragment.

    Every character outside ``[a-zA-Z0-9]`` becomes ``-``; the result is cut
    to 64 characters and then stripped of leading/trailing dashes. If nothing
    is left, ``"default"`` is returned.
    """
    dashed = re.sub(r"[^a-zA-Z0-9]", "-", session_id)
    fragment = dashed[:64].strip("-")
    if not fragment:
        return "default"
    return fragment
class SessionScoreJobManager:
    """Thread-pool manager for session-grouped async scoring jobs.

    All calls sharing a session_id append to one shared run directory, so the
    report detail page shows all samples and their aggregate metrics together.

    Locking: ``_lock`` guards the in-memory maps and the writes of the index
    files performed by ``_update_job`` / ``_persist_session_index``. Each
    session additionally has its own lock that serializes the
    read / append / rewrite cycle on that session's run directory. A session
    lock may be held while ``_lock`` is taken, never the other way round.
    """

    def __init__(
        self,
        output_dir: Path = _DEFAULT_OUTPUT_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the directories, start the worker pool and reload the index.

        Args:
            output_dir: Parent directory of the per-session run directories.
            index_dir: Directory holding one ``<job_id>.json`` per job plus a
                ``_sessions`` sub-directory with session -> job_ids mappings.
            max_workers: Size of the worker thread pool.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        (self._index_dir / "_sessions").mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        # job_id -> AsyncScoreJobStatus; guarded by _lock
        self._job_cache: dict[str, AsyncScoreJobStatus] = {}
        # session_id -> [job_ids in order]; guarded by _lock
        self._session_jobs: dict[str, list[str]] = {}
        # session_id -> per-session threading.Lock; guarded by _lock
        self._session_locks: dict[str, threading.Lock] = {}
        self._lock = threading.Lock()
        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #
    def session_run_id(self, session_id: str) -> str:
        """Return the deterministic run_id for a session (also the dir name)."""
        return f"session-{_sanitize_session_id(session_id)}"

    def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
        """Queue one scoring call for a session.

        The job is registered and persisted as ``queued`` before it is handed
        to the thread pool.

        Returns:
            ``(job_status, run_id)``. run_id is deterministic from session_id.
        """
        run_id = self.session_run_id(session_id)
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
                "session_id": session_id,
            },
        )
        with self._lock:
            self._job_cache[job_id] = status
            self._session_jobs.setdefault(session_id, []).append(job_id)
        # Persist before scheduling so the worker's own status updates can
        # never be overwritten by this initial "queued" record.
        self._persist_job_index(status)
        self._persist_session_index(session_id)
        self._executor.submit(self._run, job_id, session_id, run_id, request)
        return status, run_id

    def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status of one call, or None if unknown."""
        with self._lock:
            return self._job_cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all session job records, newest first."""
        with self._lock:
            jobs = list(self._job_cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    def get_session(self, session_id: str) -> SessionStatus | None:
        """Return aggregate status for a session, or None if unknown.

        A session with no recorded job ids is reported as unknown.
        """
        with self._lock:
            job_ids = list(self._session_jobs.get(session_id) or [])
        if not job_ids:
            return None
        run_id = self.session_run_id(session_id)
        run_dir = self._output_dir / run_id
        # Compute live metric means from the CSV (may be mid-update — best effort).
        # Done outside _lock so slow disk reads do not block other callers.
        metric_means = self._read_metric_means(run_dir)
        with self._lock:
            jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]
        latest = max((j.finished_at for j in jobs if j.finished_at), default="")
        return SessionStatus(
            session_id=session_id,
            run_id=run_id,
            call_count=len(job_ids),
            metric_means=metric_means,
            latest_finished_at=latest,
            jobs=sorted(jobs, key=lambda j: j.created_at),
        )

    def list_sessions(self) -> list[SessionStatus]:
        """Return aggregate status for all known sessions, latest finished first."""
        with self._lock:
            session_ids = list(self._session_jobs)
        results = []
        for sid in session_ids:
            status = self.get_session(sid)
            if status is not None:
                results.append(status)
        results.sort(key=lambda s: s.latest_finished_at, reverse=True)
        return results

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #
    def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
        """Score one sample then append it to the session's shared run artifacts.

        Runs on a pool thread. Every failure, including a failing lazy import
        or settings construction, is recorded on the job as ``failed``; an
        exception escaping here would only be stored on the discarded Future
        and leave the job looking "running" forever.
        """
        import logging

        logger = logging.getLogger("webapp.services.session_score_manager")
        # Provisional start time so the failure path can always report a latency.
        t0 = time.monotonic()
        try:
            self._update_job(job_id, status="running")
            # Lazy imports — keep web server bootable if ragas is not installed.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig,
                EvaluationResult,
                NormalizedSample,
                Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            skipped = sorted(requested - set(effective))
            # Latency covers scoring only, not the one-off import/setup cost.
            t0 = time.monotonic()

            # --- Scoring (can run concurrently for the same session) ----------
            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}
            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build complete scores for this sample (skipped metrics → None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)

            # Composite weighted score is temporarily disabled. It used to be
            # compute_weighted_score (rag_eval.metrics.weights) over the
            # non-None raw scores with an empty weights dict, rounded to 4 places.
            weighted = None

            # --- File I/O must be serialized per session ----------------------
            with self._get_session_lock(session_id):
                run_dir = self._output_dir / run_id
                run_dir.mkdir(parents=True, exist_ok=True)

                # Read all existing rows, then append the new one
                existing_rows = self._read_score_rows(run_dir)
                call_number = len(existing_rows) + 1
                new_row: dict[str, Any] = {
                    "sample_id": f"session-score-{call_number}",
                    "question": request.question,
                    "answer": request.answer or "",
                    "contexts": request.contexts or "",
                    "ground_truth": request.ground_truth or "",
                    "error": "",
                }
                new_row.update(all_scores)
                all_rows = existing_rows + [new_row]

                # Reconstruct NormalizedSample objects for write_run_artifacts metadata.
                # Cells that were empty in the CSV come back as None and must map
                # to "" rather than to the literal text "None".
                valid_samples = [
                    NormalizedSample(
                        sample_id=(
                            self._cell_text(row.get("sample_id"))
                            or f"session-score-{position}"
                        ),
                        question=self._cell_text(row.get("question")),
                        answer=self._cell_text(row.get("answer")),
                        contexts=[
                            part.strip()
                            for part in self._cell_text(row.get("contexts")).split(" |||| ")
                            if part.strip()
                        ],
                        ground_truth=self._cell_text(row.get("ground_truth")),
                    )
                    for position, row in enumerate(all_rows, start=1)
                ]

                # Determine all metric columns (union of all rows' metric keys)
                all_metric_names = sorted({
                    key
                    for row in all_rows
                    for key in row
                    if key not in _NON_METRIC_COLUMNS
                })
                scenario = Scenario(
                    scenario_name=f"session-{_sanitize_session_id(session_id)}",
                    mode="offline",
                    dataset=DatasetConfig(path=run_dir / "dataset.csv"),
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    metrics=all_metric_names,
                    output_dir=self._output_dir,
                    optimization_advisor=True,
                )
                # NOTE(review): nothing in this module writes a "_started_at"
                # column, so this presumably always falls back to finished_at —
                # confirm against write_run_artifacts.
                started_at_val = (
                    existing_rows[0].get("_started_at", finished_at)
                    if existing_rows
                    else finished_at
                )
                result = EvaluationResult(
                    scenario=scenario,
                    run_id=run_id,
                    started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
                    finished_at=finished_at,
                    valid_samples=valid_samples,
                    invalid_samples=[],
                    score_rows=all_rows,
                )
                write_run_artifacts(result)
                logger.info(
                    "[session_job] artifacts written job_id=%s session_id=%s call=%d",
                    job_id, session_id, call_number,
                )

                # Regenerate optimization advice over all accumulated rows.
                # Advice is best effort: a failure here must not fail the job.
                try:
                    llm, _ = build_models(judge_model, embedding_model, settings)
                    run_advisor(result, scenario, llm)
                    logger.info("[session_job] advisor done job_id=%s session=%s", job_id, session_id)
                except Exception as adv_exc:  # noqa: BLE001
                    logger.warning(
                        "[session_job] advisor failed job_id=%s err=%s", job_id, adv_exc
                    )

                self._update_job(
                    job_id,
                    status="completed",
                    finished_at=finished_at,
                    run_id=run_id,
                    scores=all_scores,
                    weighted_score=weighted,
                    latency_ms=latency_ms,
                    skipped_metrics=skipped,
                )
                self._persist_session_index(session_id)
        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            # logger.exception keeps the traceback alongside the message.
            logger.exception("[session_job] failed job_id=%s err=%s", job_id, exc)
            self._update_job(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _cell_text(value: Any) -> str:
        """Return a row cell as text, mapping missing values (None/NaN) to ""."""
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return ""
        return str(value)

    def _get_session_lock(self, session_id: str) -> threading.Lock:
        """Return the lock for ``session_id``, creating it on first use."""
        with self._lock:
            if session_id not in self._session_locks:
                self._session_locks[session_id] = threading.Lock()
            return self._session_locks[session_id]

    def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
        """Read existing scores.csv rows as dicts with missing cells set to None.

        Returns an empty list if the file does not exist or cannot be parsed.
        """
        scores_path = run_dir / "scores.csv"
        if not scores_path.is_file():
            return []
        try:
            frame = pd.read_csv(scores_path)
        except (OSError, ValueError):
            return []
        # Convert cell by cell: DataFrame.where(..., None) leaves NaN in
        # numeric columns on current pandas versions.
        return [
            {column: (None if pd.isna(value) else value) for column, value in record.items()}
            for record in frame.to_dict("records")
        ]

    def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
        """Compute per-metric means from the session's scores.csv.

        Only numeric columns outside ``_NON_METRIC_COLUMNS`` are included; a
        column with no values yields None. Returns an empty dict if the file
        does not exist or cannot be parsed.
        """
        scores_path = run_dir / "scores.csv"
        if not scores_path.is_file():
            return {}
        try:
            frame = pd.read_csv(scores_path)
        except (OSError, ValueError):
            return {}
        means: dict[str, float | None] = {}
        for col in frame.columns:
            if col in _NON_METRIC_COLUMNS:
                continue
            if pd.api.types.is_numeric_dtype(frame[col]):
                val = frame[col].mean(numeric_only=True)
                means[col] = None if pd.isna(val) else round(float(val), 4)
        return means

    def _update_job(self, job_id: str, **kwargs: Any) -> None:
        """Apply field updates to a cached job and persist it; no-op if unknown."""
        with self._lock:
            existing = self._job_cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._job_cache[job_id] = updated
            # Written under the lock so updates reach disk in cache order.
            self._persist_job_index(updated)

    def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
        """Persist a single job's status to the index directory."""
        path = self._index_dir / f"{status.job_id}.json"
        path.write_text(
            json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def _persist_session_index(self, session_id: str) -> None:
        """Persist the session→job_ids mapping.

        Must not be called while ``_lock`` is held (the lock is not reentrant).
        """
        run_id = self.session_run_id(session_id)
        path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
        # Snapshot and write under the same lock: otherwise a submit() racing a
        # finishing worker could write an older job list over a newer one.
        with self._lock:
            data = {
                "session_id": session_id,
                "run_id": run_id,
                "job_ids": list(self._session_jobs.get(session_id) or []),
            }
            path.write_text(
                json.dumps(data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

    def _load_existing(self) -> None:
        """Restore job cache and session mappings from persisted index files.

        Called from ``__init__`` before any job is submitted, so no locking is
        needed. Unreadable or invalid files are skipped with a warning.
        """
        import logging

        logger = logging.getLogger("webapp.services.session_score_manager")

        # Load individual job files
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                logger.warning("[session_job] skipping unreadable job index %s: %s", path, exc)
                continue
            self._job_cache[status.job_id] = status

        # Load session→job_ids mappings
        sessions_dir = self._index_dir / "_sessions"
        if not sessions_dir.is_dir():
            return
        for path in sorted(sessions_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                sid = data.get("session_id", "")
                job_ids = data.get("job_ids", [])
            except Exception as exc:  # noqa: BLE001
                logger.warning("[session_job] skipping unreadable session index %s: %s", path, exc)
                continue
            # Entries without a session id or with a non-list job_ids are ignored.
            if sid and isinstance(job_ids, list):
                self._session_jobs[sid] = job_ids
# Module-level singleton shared by FastAPI routes.
# Constructed at import time with the default directories, so importing this
# module creates those directories and reloads any persisted job index.
session_score_manager = SessionScoreJobManager()