siemens_ragas/webapp/services/session_score_manager.py

"""Background task manager for session-grouped async RAGAS scoring.

Each session groups multiple scoring calls into one shared run report:

  1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
  2. Every call: appends a new sample row to scores.csv, rewrites summary.md
     and optimization_advice.md by re-running write_run_artifacts + run_advisor
     over ALL accumulated rows.
  3. The resulting run directory is picked up automatically by run_reader, so the
     「运行列表」 (run list) and 「报告详情」 (report detail) pages show the live,
     growing report.

Concurrency model:
  - Scoring (LLM network I/O) runs freely in the thread pool — different sessions
    score concurrently; multiple calls to the same session also start scoring in
    parallel.
  - File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
    a per-session threading.Lock, so no two calls corrupt the same session's CSV.
"""

from __future__ import annotations

import json
import logging
import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import pandas as pd

from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus

_REPO_ROOT = Path(__file__).resolve().parents[2]
_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"

# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS)
_NON_METRIC_COLUMNS = {
    "sample_id", "question", "contexts", "answer", "ground_truth",
    "scenario", "language", "retrieval_config", "error",
    "judge_model", "embedding_model", "run_id", "difficulty",
    "question_type", "doc_id", "doc_name", "section_path",
    "page_start", "page_end", "source_chunk_ids", "review_status",
    "review_notes", "weighted_score", "sample_weight",
}


def _now_iso() -> str:
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()


def _sanitize_session_id(session_id: str) -> str:
    """Turn an arbitrary session_id into a fragment safe for a directory name.

    Every character outside ``[a-zA-Z0-9]`` becomes ``-``; the result is cut to
    64 characters, leading/trailing dashes are stripped, and an empty result
    falls back to ``"default"``.
    """
    dashed = re.sub(r"[^a-zA-Z0-9]", "-", session_id)
    fragment = dashed[:64].strip("-")
    if not fragment:
        return "default"
    return fragment


class SessionScoreJobManager:
    """Thread-pool manager for session-grouped async scoring jobs.

    All calls sharing a session_id append to one shared run directory
    (``<output_dir>/session-<sanitized id>``), so the report detail page shows
    all samples and their aggregate metrics together.

    In-memory state is mirrored to ``index_dir`` as JSON so it survives a
    restart:

      - ``<job_id>.json``            one dump of ``AsyncScoreJobStatus`` per job
      - ``_sessions/<sanitized>.json`` the session -> job_ids mapping

    Locking:

      - ``_lock`` guards the in-memory dictionaries only (never held during I/O).
      - One lock per session serializes CSV/artifact I/O for that session.
      - ``_session_index_lock`` serializes snapshot+write of session index files
        so an older snapshot can never overwrite a newer one.
    """

    def __init__(
        self,
        output_dir: Path = _DEFAULT_OUTPUT_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the directories, the worker pool, and reload persisted state.

        Args:
            output_dir: Parent directory of the per-session run directories.
            index_dir: Directory holding the job / session index JSON files.
            max_workers: Size of the scoring thread pool.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        (self._index_dir / "_sessions").mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)

        # job_id -> AsyncScoreJobStatus; guarded by _lock
        self._job_cache: dict[str, AsyncScoreJobStatus] = {}
        # session_id -> [job_ids in order]; guarded by _lock
        self._session_jobs: dict[str, list[str]] = {}
        # session_id -> per-session threading.Lock; guarded by _lock
        self._session_locks: dict[str, threading.Lock] = {}
        self._lock = threading.Lock()
        # Serializes snapshot+write of session index files (see _persist_session_index).
        self._session_index_lock = threading.Lock()

        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def session_run_id(self, session_id: str) -> str:
        """Return the deterministic run_id for a session (also the dir name)."""
        return f"session-{_sanitize_session_id(session_id)}"

    def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
        """Queue one scoring call for a session.

        The job is registered in memory, persisted to the index directory, and
        then handed to the thread pool.

        Returns:
            ``(job_status, run_id)``; run_id is deterministic from session_id.
        """
        run_id = self.session_run_id(session_id)
        job_id = uuid.uuid4().hex[:12]

        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
                "session_id": session_id,
            },
        )

        with self._lock:
            self._job_cache[job_id] = status
            self._session_jobs.setdefault(session_id, []).append(job_id)

        # Persist the "queued" record before the worker can start updating it.
        self._persist_job_index(status)
        self._persist_session_index(session_id)
        self._executor.submit(self._run, job_id, session_id, run_id, request)
        return status, run_id

    def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status of one call, or None if unknown."""
        with self._lock:
            return self._job_cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all session job records, newest first (by ``created_at``)."""
        with self._lock:
            jobs = list(self._job_cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    def get_session(self, session_id: str) -> SessionStatus | None:
        """Return aggregate status for a session, or None if unknown.

        ``call_count`` counts every job id recorded for the session, while
        ``jobs`` only contains those whose status record is still in the cache.
        """
        with self._lock:
            job_ids = list(self._session_jobs.get(session_id) or [])
        if not job_ids:
            return None

        run_id = self.session_run_id(session_id)
        run_dir = self._output_dir / run_id

        # Compute live metric means from the CSV (may be mid-update — best effort)
        metric_means = self._read_metric_means(run_dir)

        with self._lock:
            jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]

        latest = max((j.finished_at for j in jobs if j.finished_at), default="")
        return SessionStatus(
            session_id=session_id,
            run_id=run_id,
            call_count=len(job_ids),
            metric_means=metric_means,
            latest_finished_at=latest,
            jobs=sorted(jobs, key=lambda j: j.created_at),
        )

    def list_sessions(self) -> list[SessionStatus]:
        """Return aggregate status for all known sessions, latest-finished first."""
        with self._lock:
            session_ids = list(self._session_jobs)
        statuses = [
            status
            for status in (self.get_session(sid) for sid in session_ids)
            if status is not None
        ]
        statuses.sort(key=lambda s: s.latest_finished_at, reverse=True)
        return statuses

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
        """Score one sample, then append it to the session's shared run artifacts.

        Runs on an executor thread. Any exception — including a failure of the
        lazy imports or of settings construction — marks the job ``"failed"``
        with the error text, so a job is never left in ``"running"`` because its
        exception was swallowed by the executor's Future.
        """
        logger = logging.getLogger("webapp.services.session_score_manager")
        # Provisional start so the failure path can always report a latency; it
        # is reset right before scoring so a successful latency covers scoring only.
        t0 = time.monotonic()

        try:
            self._update_job(job_id, status="running")

            # Lazy imports — keep web server bootable if ragas is not installed.
            # They live inside the try so a missing dependency fails the job
            # instead of escaping silently into the Future.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            # Kept for the temporarily disabled weighted-score block below.
            from rag_eval.metrics.weights import compute_weighted_score  # noqa: F401
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (  # noqa: F401
                DatasetConfig, EvaluationResult, NormalizedSample,
                RuntimeConfig, Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            # Metrics the caller asked for that will not be computed for this sample.
            skipped = sorted(requested - set(effective))

            t0 = time.monotonic()

            # --- Scoring (can run concurrently for the same session) ----------
            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}

            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build complete scores for this sample (skipped metrics → None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)

            # Composite weighted-score computation (temporarily disabled)
            # weighted_raw = compute_weighted_score(
            #     {k: v for k, v in raw_scores.items() if v is not None}, {}
            # )
            # weighted = round(weighted_raw, 4) if weighted_raw is not None else None
            weighted = None

            # --- File I/O must be serialized per session ----------------------
            session_lock = self._get_session_lock(session_id)
            with session_lock:
                run_dir = self._output_dir / run_id
                run_dir.mkdir(parents=True, exist_ok=True)

                # Read all existing rows, then append the new one
                existing_rows = self._read_score_rows(run_dir)
                call_number = len(existing_rows) + 1

                new_row: dict[str, Any] = {
                    "sample_id": f"session-score-{call_number}",
                    "question": request.question,
                    "answer": request.answer or "",
                    "contexts": request.contexts or "",
                    "ground_truth": request.ground_truth or "",
                    "error": "",
                }
                new_row.update(all_scores)

                all_rows = existing_rows + [new_row]

                # Reconstruct NormalizedSample objects for write_run_artifacts metadata.
                # Cells read back from the CSV may be None/NaN (empty cell); they must
                # become "" rather than the literal text "None"/"nan".
                valid_samples = [
                    NormalizedSample(
                        sample_id=(
                            self._cell_text(row.get("sample_id"))
                            or f"session-score-{position}"
                        ),
                        question=self._cell_text(row.get("question")),
                        answer=self._cell_text(row.get("answer")),
                        contexts=[
                            part.strip()
                            for part in self._cell_text(row.get("contexts")).split(" |||| ")
                            if part.strip()
                        ],
                        ground_truth=self._cell_text(row.get("ground_truth")),
                    )
                    for position, row in enumerate(all_rows, start=1)
                ]

                # Determine all metric columns (union of all rows' metric keys)
                all_metric_names = sorted({
                    k for row in all_rows
                    for k in row if k not in _NON_METRIC_COLUMNS
                })

                scenario = Scenario(
                    scenario_name=f"session-{_sanitize_session_id(session_id)}",
                    mode="offline",
                    dataset=DatasetConfig(path=run_dir / "dataset.csv"),
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    metrics=all_metric_names,
                    output_dir=self._output_dir,
                    optimization_advisor=True,
                )

                # NOTE(review): nothing in this file writes a "_started_at" column, so
                # unless write_run_artifacts adds one this always resolves to
                # finished_at — confirm whether the session start should be kept
                # elsewhere (e.g. metadata.json).
                started_at_val = (
                    existing_rows[0].get("_started_at", finished_at)
                    if existing_rows else finished_at
                )

                result = EvaluationResult(
                    scenario=scenario,
                    run_id=run_id,
                    started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
                    finished_at=finished_at,
                    valid_samples=valid_samples,
                    invalid_samples=[],
                    score_rows=all_rows,
                )

                write_run_artifacts(result)
                logger.info(
                    "[session_job] artifacts written  job_id=%s  session_id=%s  call=%d",
                    job_id, session_id, call_number,
                )

                # Regenerate optimization advice over all accumulated rows.
                # Best effort: an advisor failure must not fail the scoring job.
                try:
                    llm, _ = build_models(judge_model, embedding_model, settings)
                    run_advisor(result, scenario, llm)
                    logger.info("[session_job] advisor done  job_id=%s  session=%s", job_id, session_id)
                except Exception as adv_exc:  # noqa: BLE001
                    logger.warning(
                        "[session_job] advisor failed  job_id=%s  err=%s", job_id, adv_exc
                    )

            self._update_job(
                job_id,
                status="completed",
                finished_at=finished_at,
                run_id=run_id,
                scores=all_scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )
            self._persist_session_index(session_id)

        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            # logger.exception logs at ERROR level and attaches the traceback.
            logger.exception("[session_job] failed  job_id=%s  err=%s", job_id, exc)
            self._update_job(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    @staticmethod
    def _cell_text(value: Any) -> str:
        """Return a row cell as text, mapping missing values (None / NaN) to ``""``."""
        if value is None:
            return ""
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ""
        return str(value)

    @staticmethod
    def _write_json_atomic(path: Path, payload: Any) -> None:
        """Write *payload* as JSON to *path* via a temp file and a rename.

        Readers therefore see either the previous complete file or the new
        complete file, never a partially written one.
        """
        text = json.dumps(payload, ensure_ascii=False, indent=2)
        # Unique name so concurrent writers never share a temp file; the ".tmp"
        # suffix keeps it out of the "*.json" globs used by _load_existing.
        tmp_path = path.with_name(f".{path.name}.{uuid.uuid4().hex}.tmp")
        try:
            tmp_path.write_text(text, encoding="utf-8")
            tmp_path.replace(path)
        finally:
            # No-op after a successful replace; removes the leftover on failure.
            tmp_path.unlink(missing_ok=True)

    def _get_session_lock(self, session_id: str) -> threading.Lock:
        """Return the I/O lock for *session_id*, creating it on first use."""
        with self._lock:
            return self._session_locks.setdefault(session_id, threading.Lock())

    def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
        """Read existing scores.csv rows; missing cells are returned as ``None``.

        Returns an empty list if the file does not exist or cannot be parsed.
        """
        scores_path = run_dir / "scores.csv"
        if not scores_path.is_file():
            return []
        try:
            frame = pd.read_csv(scores_path)
        except (OSError, ValueError) as exc:
            logging.getLogger("webapp.services.session_score_manager").warning(
                "[session_job] could not read %s, treating as empty: %s", scores_path, exc
            )
            return []
        # Cast to object first: a float column cannot hold None, so without the
        # cast the replacement value would be coerced back to NaN.
        return frame.astype(object).where(frame.notna(), None).to_dict("records")

    def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
        """Compute per-metric means from the session's scores.csv.

        Only numeric columns not listed in ``_NON_METRIC_COLUMNS`` are included;
        a column whose mean is NaN (no values) maps to ``None``. Returns an empty
        dict if the file does not exist or cannot be parsed.
        """
        scores_path = run_dir / "scores.csv"
        if not scores_path.is_file():
            return {}
        try:
            frame = pd.read_csv(scores_path)
        except (OSError, ValueError):
            return {}
        means: dict[str, float | None] = {}
        for col in frame.columns:
            if col in _NON_METRIC_COLUMNS:
                continue
            if not pd.api.types.is_numeric_dtype(frame[col]):
                continue
            # The dtype check above already guarantees a numeric column, so no
            # numeric_only argument is needed.
            val = frame[col].mean()
            means[col] = None if pd.isna(val) else round(float(val), 4)
        return means

    def _update_job(self, job_id: str, **kwargs: Any) -> None:
        """Apply field updates to a cached job and persist the new record.

        Unknown job ids are ignored.
        """
        with self._lock:
            existing = self._job_cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._job_cache[job_id] = updated
        # Disk write happens outside _lock so readers are not blocked on I/O.
        self._persist_job_index(updated)

    def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
        """Persist a single job's status to ``<index_dir>/<job_id>.json``."""
        path = self._index_dir / f"{status.job_id}.json"
        self._write_json_atomic(path, status.model_dump())

    def _persist_session_index(self, session_id: str) -> None:
        """Persist the session→job_ids mapping to ``_sessions/<sanitized id>.json``."""
        run_id = self.session_run_id(session_id)
        path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
        # Snapshot and write under one lock: submit() and workers both call this
        # for the same session, and without it an older snapshot could be
        # written after (and so overwrite) a newer one.
        with self._session_index_lock:
            with self._lock:
                job_ids = list(self._session_jobs.get(session_id) or [])
            data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}
            self._write_json_atomic(path, data)

    def _load_existing(self) -> None:
        """Restore job cache and session mappings from persisted index files on startup.

        Unreadable or invalid files are skipped with a warning rather than
        aborting startup. Called from ``__init__`` before the instance is
        shared, so no locking is needed here.

        NOTE(review): jobs persisted as "queued"/"running" by a previous process
        are restored unchanged and nothing here resubmits them.
        """
        logger = logging.getLogger("webapp.services.session_score_manager")

        # Load individual job files (non-recursive, so "_sessions" is not included)
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[session_job] skipping unreadable job index  path=%s  err=%s", path, exc
                )
                continue
            self._job_cache[status.job_id] = status

        # Load session→job_ids mappings
        sessions_dir = self._index_dir / "_sessions"
        if not sessions_dir.is_dir():
            return
        for path in sorted(sessions_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                sid = data.get("session_id", "")
                job_ids = data.get("job_ids", [])
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "[session_job] skipping unreadable session index  path=%s  err=%s", path, exc
                )
                continue
            # submit() appends to this value, so only accept a real list.
            if sid and isinstance(job_ids, list):
                self._session_jobs[sid] = job_ids


# Module-level singleton shared by FastAPI routes.
# NOTE: it is constructed at import time with the default directories, so
# importing this module creates the output/index directories, starts a thread
# pool, and reloads any previously persisted job/session index files.
session_score_manager = SessionScoreJobManager()