feat(session-async): add /api/score/session_async with incremental session report aggregation

- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-06-26 16:09:33 +08:00
parent e1751447df
commit 754a30ad59
36 changed files with 2004 additions and 51 deletions

View File

@@ -156,10 +156,11 @@ def score_sample(
all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
all_scores.update(raw_scores)
weighted = compute_weighted_score(
{key: value for key, value in raw_scores.items() if value is not None},
{},
)
# 综合加权得分计算(已暂时禁用)
# weighted = compute_weighted_score(
# {key: value for key, value in raw_scores.items() if value is not None},
# {},
# )
logger.info(
"[score] done latency=%dms skipped=%s scores=%s",
@@ -169,7 +170,7 @@ def score_sample(
)
return ScoreResponse(
scores=all_scores,
weighted_score=round(weighted, 4) if weighted is not None else None,
weighted_score=None, # 综合加权得分已暂时禁用
latency_ms=latency_ms,
skipped_metrics=skipped,
)

View File

@@ -0,0 +1,171 @@
"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
`POST /api/score/session_async` call with a shared `session_id`. All results are
accumulated into one report, visible in 「运行列表」→「报告详情」.
Key behaviour:
- Deterministic run_id: derived from session_id — same session always maps to the
same report directory (outputs/score-session/session-<id>/).
- Append semantics: each call adds a new sample row. Previous rows are preserved.
- Advisor regeneration: optimization_advice.md is regenerated after every call
using the full set of accumulated rows.
- Each call returns its own `job_id` for individual status polling, plus the
shared `run_id` and `session_id`.
Endpoints:
POST /api/score/session_async Submit one call (returns job_id + run_id)
GET /api/score/sessions List all sessions
GET /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
GET /api/score/session/jobs/{job_id} Status of one individual call
"""
from __future__ import annotations
import logging
from fastapi import APIRouter, HTTPException
from webapp.models import (
AsyncScoreJobStatus,
ScoreRequest,
SessionScoreJobResponse,
SessionScoreRequest,
SessionStatus,
)
from webapp.services.session_score_manager import session_score_manager
router = APIRouter(prefix="/api/score", tags=["score"])
logger = logging.getLogger("webapp.api.session_score_jobs")
@router.post(
"/session_async",
status_code=202,
response_model=SessionScoreJobResponse,
summary="提交 Session 异步评分(多样本批量聚合)",
responses={
202: {
"description": (
"调用已排队,立即返回 job_id + run_id202 Accepted\n\n"
"相同 `session_id` 的多次调用合并为同一报告,每次调用新增一个样本行。\n"
"评分完成后,`summary.md` 和 `optimization_advice.md` 增量更新。\n"
"通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态,"
"通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态,"
"在「运行列表」中查看完整报告run_id 即 `session-<session_id>` 形式)。"
),
"content": {
"application/json": {
"example": {
"job_id": "abc123def456",
"session_id": "dify-session-001",
"run_id": "session-dify-session-001",
"status": "queued",
"call_count": 1,
}
}
},
},
},
)
def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
"""提交 Session 异步 RAGAS 评分,立即返回 job_id。
相同 `session_id` 的多次调用合并到同一评估报告中,每次调用:
1. 新增一个样本行到 `scores.csv`
2. 重写 `summary.md`(包含所有累积样本的指标均值)
3. 重新生成 `optimization_advice.md`(基于全量样本的 LLM 优化建议)
**适合 Dify 工作流**:在循环节点中批量调用,所有轮次共用同一 `session_id`
最终在 RAGAS 平台「运行列表」中查看完整的批量评估报告。
"""
logger.info(
"[session_async] submit session_id=%s metrics=%s has_ctx=%s has_gt=%s",
request.session_id,
request.metrics,
bool(request.contexts),
bool(request.ground_truth),
)
# Strip session_id to build a plain ScoreRequest for the manager
score_request = ScoreRequest(
question=request.question,
answer=request.answer,
contexts=request.contexts,
ground_truth=request.ground_truth,
context_separator=request.context_separator,
metrics=request.metrics,
judge_model=request.judge_model,
embedding_model=request.embedding_model,
)
status, run_id = session_score_manager.submit(request.session_id, score_request)
# Compute call_count from current session state
session_status = session_score_manager.get_session(request.session_id)
call_count = session_status.call_count if session_status else 1
logger.info(
"[session_async] queued job_id=%s session_id=%s run_id=%s call=%d",
status.job_id, request.session_id, run_id, call_count,
)
return SessionScoreJobResponse(
job_id=status.job_id,
session_id=request.session_id,
run_id=run_id,
status=status.status,
call_count=call_count,
)
@router.get(
"/sessions",
response_model=dict,
summary="列出所有 Session 聚合状态",
)
def list_sessions() -> dict:
"""返回所有 session 的聚合状态,按最近完成时间倒序排列。"""
sessions = session_score_manager.list_sessions()
logger.info("[session_score] list_sessions count=%d", len(sessions))
return {"sessions": [s.model_dump() for s in sessions]}
@router.get(
"/sessions/{session_id}",
response_model=SessionStatus,
summary="查询 Session 聚合状态(指标均值 + 所有调用记录)",
responses={404: {"description": "指定 session_id 不存在。"}},
)
def get_session(session_id: str) -> SessionStatus:
"""查询 session 的聚合评分状态。
返回内容:
- `run_id`:在「运行列表」中查看完整报告
- `call_count`:本 session 累计调用次数
- `metric_means`:所有已累积样本的各指标均值(实时读取 scores.csv
- `jobs`:本 session 所有调用记录列表
"""
status = session_score_manager.get_session(session_id)
if status is None:
raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
return status
@router.get(
"/session/jobs/{job_id}",
response_model=AsyncScoreJobStatus,
summary="查询 Session 单次调用状态",
responses={404: {"description": "指定 job_id 不存在。"}},
)
def get_session_job(job_id: str) -> AsyncScoreJobStatus:
"""查询 session 评分中某次调用的状态和评分结果。
`status` 为 `completed` 时,`run_id` 即所属 session 的报告目录,
`scores` 包含本次调用的各指标得分。
"""
status = session_score_manager.get_job(job_id)
if status is None:
raise HTTPException(
status_code=404, detail=f"Session score job not found: {job_id}"
)
return status

View File

@@ -531,6 +531,50 @@ class AsyncScoreJobResponse(BaseModel):
)
# ---------------------------------------------------------------------------
# Session async 评分模型
# ---------------------------------------------------------------------------
class SessionScoreRequest(ScoreRequest):
"""Request body for session-grouped async scoring.
All calls sharing the same session_id are accumulated into one report.
Each call adds a new sample row to the session's scores.csv.
"""
session_id: str = Field(
description=(
"会话唯一标识符。相同 session_id 的多次调用合并为同一报告,"
"每次调用新增一个样本行,指标均值和优化建议在每次调用后增量更新。"
),
)
class SessionScoreJobResponse(BaseModel):
"""Immediate 202 response after submitting a session scoring call."""
job_id: str = Field(description="本次调用的任务唯一标识符。")
session_id: str = Field(description="会话标识符。")
run_id: str = Field(description="本 session 对应的报告 Run ID可在「运行列表」中查看。")
status: str = Field(default="queued", description="初始状态queued。")
call_count: int = Field(default=1, description="本 session 当前累计调用次数(包含本次)。")
class SessionStatus(BaseModel):
"""Aggregate status and metrics for a scoring session."""
session_id: str = Field(description="会话标识符。")
run_id: str = Field(description="对应报告目录的 Run ID。")
call_count: int = Field(description="本 session 累计调用次数。")
metric_means: dict[str, float | None] = Field(
default_factory=dict, description="所有已累积样本的各指标均值。"
)
latest_finished_at: str = Field(default="", description="最近一次评分完成时间ISO 8601 UTC")
jobs: list[AsyncScoreJobStatus] = Field(
default_factory=list, description="本 session 所有调用记录,按创建时间排序。"
)
class AsyncScoreJobStatus(BaseModel):
"""State of one async score job (queued → running → completed/failed)."""

View File

@@ -17,7 +17,7 @@ from fastapi.exceptions import RequestValidationError
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs, session_score_jobs
STATIC_DIR = Path(__file__).resolve().parent / "static"
logger = logging.getLogger("webapp.server")
@@ -73,6 +73,10 @@ OPENAPI_TAGS = [
"**异步评分 APIDify 推荐)** — `POST /api/score/async`\n\n"
"异步方式立即返回 job_id202评分在后台执行完成后自动生成完整报告含优化建议"
"在「运行列表」页查看。\n\n"
"**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
"适合 Dify 循环节点批量评估:同一 `session_id` 的多次调用合并为一个报告,"
"每次调用新增一个样本行,指标均值和优化建议增量更新。\n"
"通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n\n"
"通过 `GET /api/score/jobs` 列出所有异步评分记录,"
"`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
@@ -111,6 +115,7 @@ def create_app() -> FastAPI:
app.include_router(pipeline.router)
app.include_router(score.router)
app.include_router(score_jobs.router)
app.include_router(session_score_jobs.router)
@app.middleware("http")
async def access_log_middleware(request: Request, call_next):

View File

@@ -0,0 +1,257 @@
"""Background task manager for end-to-end pipeline jobs (build + eval).
Each job runs three sequential phases inside a worker thread:
1. parsing_documents — AliyunDocmind parses every PDF
2. generating_questions — LLM generates a draft question bank
3. evaluating — RAGAS online evaluation scores each question
The DatasetBuildJob and Scenario objects are constructed entirely from the
API request parameters, so no YAML config files are needed.
"""
from __future__ import annotations
import io
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime, timezone
from pathlib import Path
from webapp.models import (
PipelineJobRequest,
PipelineJobStatus,
PipelineResult,
)
_REPO_ROOT = Path(__file__).resolve().parents[2]
_PIPELINE_OUTPUT_ROOT = _REPO_ROOT / "outputs" / "pipeline"
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
class _LineCapture(io.TextIOBase):
"""Write-only stream that appends complete lines to a task's log buffer."""
def __init__(self, sink: "PipelineTask") -> None:
self._sink = sink
self._buffer = ""
def write(self, text: str) -> int:
self._buffer += text
while "\n" in self._buffer:
line, self._buffer = self._buffer.split("\n", 1)
self._sink.append_log(line)
return len(text)
def flush(self) -> None:
if self._buffer:
self._sink.append_log(self._buffer)
self._buffer = ""
class PipelineTask:
"""Mutable state for one pipeline job (build + eval)."""
def __init__(self, job_id: str, job_name: str) -> None:
self.job_id = job_id
self.job_name = job_name
self.status = "queued"
self.phase = "idle"
self.logs: list[str] = []
self.result: PipelineResult | None = None
self.error: str | None = None
self.created_at = _now_iso()
self.finished_at = ""
self._lock = threading.Lock()
def append_log(self, line: str) -> None:
with self._lock:
self.logs.append(line)
def snapshot(self) -> PipelineJobStatus:
with self._lock:
return PipelineJobStatus(
job_id=self.job_id,
job_name=self.job_name,
status=self.status,
phase=self.phase,
logs=list(self.logs),
result=self.result,
error=self.error,
created_at=self.created_at,
finished_at=self.finished_at,
)
class PipelineTaskManager:
"""Owns the thread pool and registry of pipeline jobs."""
def __init__(self, max_workers: int = 2) -> None:
self._executor = ThreadPoolExecutor(max_workers=max_workers)
self._tasks: dict[str, PipelineTask] = {}
self._lock = threading.Lock()
def submit(self, request: PipelineJobRequest) -> PipelineTask:
"""Register and schedule a new pipeline job; return its task object."""
job_id = uuid.uuid4().hex[:12]
job_name = request.job_name.strip() or f"pipeline-{job_id[:6]}"
task = PipelineTask(job_id=job_id, job_name=job_name)
with self._lock:
self._tasks[job_id] = task
self._executor.submit(self._run, task, request)
return task
def get(self, job_id: str) -> PipelineJobStatus | None:
with self._lock:
task = self._tasks.get(job_id)
return task.snapshot() if task is not None else None
def list_jobs(self) -> list[PipelineJobStatus]:
with self._lock:
tasks = list(self._tasks.values())
snapshots = [t.snapshot() for t in tasks]
snapshots.sort(key=lambda s: s.created_at, reverse=True)
return snapshots
# ------------------------------------------------------------------ #
# Worker
# ------------------------------------------------------------------ #
def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:
"""Execute the full pipeline end to end inside a worker thread."""
task.status = "running"
task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")
capture = _LineCapture(task)
try:
with redirect_stdout(capture), redirect_stderr(capture):
result = self._execute(task, request)
capture.flush()
task.result = result
task.phase = "done"
task.status = "completed"
task.append_log(f"[{_now_iso()}] pipeline 任务完成: {task.job_name}")
except Exception as exc: # noqa: BLE001
capture.flush()
task.error = f"{type(exc).__name__}: {exc}"
task.append_log(f"[{_now_iso()}] pipeline 任务失败: {task.error}")
task.status = "failed"
finally:
task.finished_at = _now_iso()
def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:
"""Run build then eval, updating task.phase as we go."""
# ── resolve paths ──────────────────────────────────────────────
docs_path = Path(req.docs_path)
if not docs_path.is_absolute():
docs_path = (_REPO_ROOT / docs_path).resolve()
if not docs_path.is_dir():
raise ValueError(f"docs_path is not an existing directory: {docs_path}")
job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id
build_artifact_dir = job_output_dir / "build"
dataset_csv = job_output_dir / "generated_dataset.csv"
eval_output_dir = job_output_dir / "eval"
# ── phase 1 + 2: dataset build (parse & generate) ─────────────
task.phase = "parsing_documents"
task.append_log(f" [build] 扫描文档目录: {docs_path}")
build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)
source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"
total_q = len(build_result.draft_samples)
parse_failures = len(build_result.parse_failures)
task.append_log(f" [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")
if total_q == 0:
raise RuntimeError("题库为空(所有文档均解析或生成失败),中止评估。")
# ── phase 3: evaluation ────────────────────────────────────────
task.phase = "evaluating"
task.append_log(f" [eval] 开始 RAGAS 评估,共 {total_q} 道题目")
eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)
from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths
eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)
return PipelineResult(
build_artifact_dir=build_artifact_dir.as_posix(),
dataset_csv=dataset_csv.as_posix(),
source_chunks_jsonl=source_chunks_jsonl.as_posix(),
total_questions=total_q,
parse_failures=parse_failures,
eval_run_id=eval_result.run_id,
eval_output_dir=eval_result.scenario.output_dir.as_posix(),
scores_csv=eval_artifact_paths.scores_csv.as_posix(),
summary_md=eval_artifact_paths.summary_md.as_posix(),
)
def _run_build(self, task: PipelineTask, req: PipelineJobRequest,
docs_path: Path, artifact_dir: Path, dataset_csv: Path):
"""Construct DatasetBuildJob and run the build phase."""
from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
from rag_eval.dataset_builder.runner import execute_dataset_build_job
from rag_eval.settings import EvaluationSettings
settings = EvaluationSettings()
job = DatasetBuildJob(
job_name=task.job_name,
input_path=docs_path,
input_glob="*.pdf",
parser_provider="aliyun_docmind",
failure_mode=req.failure_mode, # type: ignore[arg-type]
generation_model=req.generation_model,
output_type="online_question_bank",
review_mode="draft_with_manual_review",
max_questions_per_document=req.max_questions_per_document,
max_source_chunks_per_question=req.max_source_chunks_per_question,
dataset_path=dataset_csv,
artifact_dir=artifact_dir,
runtime=DatasetBuildRuntime(max_documents=req.max_documents),
)
return execute_dataset_build_job(job, settings=settings)
def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,
dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):
"""Construct Scenario and run the evaluation phase."""
from rag_eval.execution.runner import run_scenario_from_scenario_obj
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import (
AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario,
)
settings = EvaluationSettings()
scenario = Scenario(
scenario_name=task.job_name,
mode="online",
dataset=DatasetConfig(path=dataset_csv),
judge_model=req.judge_model,
embedding_model=req.embedding_model,
metrics=list(req.metrics),
output_dir=eval_output_dir,
runtime=RuntimeConfig(
batch_size=4,
app_concurrency=2,
metric_concurrency=2,
max_samples=req.max_samples,
),
app_adapter=AppAdapterConfig(
type="python",
callable="apps.siemens_pdf_qa.adapter:run",
static_kwargs={
"source_chunks_path": source_chunks_jsonl,
"model": req.answer_model,
},
),
optimization_advisor=req.optimization_advisor,
)
return run_scenario_from_scenario_obj(scenario, settings=settings)
# Module-level singleton shared by the FastAPI routes.
pipeline_task_manager = PipelineTaskManager()

View File

@@ -177,9 +177,11 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
overall_ws = compute_overall_weighted_score_mean(
score_rows_list, metric_weights, doc_weights
)
# 综合加权得分计算(已暂时禁用)
# overall_ws = compute_overall_weighted_score_mean(
# score_rows_list, metric_weights, doc_weights
# )
overall_ws = None
distributions = {
metric: _distribution(frame, metric)

View File

@@ -149,10 +149,12 @@ class ScoreJobManager:
# Build full scores dict (skipped = None)
all_scores: dict[str, float | None] = {m: None for m in request.metrics}
all_scores.update(raw_scores)
weighted_raw = compute_weighted_score(
{k: v for k, v in raw_scores.items() if v is not None}, {}
)
weighted = round(weighted_raw, 4) if weighted_raw is not None else None
# 综合加权得分计算(已暂时禁用)
# weighted_raw = compute_weighted_score(
# {k: v for k, v in raw_scores.items() if v is not None}, {}
# )
# weighted = round(weighted_raw, 4) if weighted_raw is not None else None
weighted = None
# Build a score row compatible with report_builder
score_row: dict[str, Any] = {

View File

@@ -0,0 +1,452 @@
"""Background task manager for session-grouped async RAGAS scoring.
Each session groups multiple scoring calls into one shared run report:
1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
2. Every call: appends a new sample row to scores.csv, rewrites summary.md
and optimization_advice.md by re-running write_run_artifacts + run_advisor
over ALL accumulated rows.
3. The resulting run directory is picked up automatically by run_reader, so the
「运行列表」 and 「报告详情」 pages show the live, growing report.
Concurrency model:
- Scoring (LLM network I/O) runs freely in the thread pool — different sessions
score concurrently; multiple calls to the same session also start scoring in
parallel.
- File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
a per-session threading.Lock, so no two calls corrupt the same session's CSV.
"""
from __future__ import annotations
import json
import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import pandas as pd
from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
_REPO_ROOT = Path(__file__).resolve().parents[2]
_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"
# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS)
_NON_METRIC_COLUMNS = {
"sample_id", "question", "contexts", "answer", "ground_truth",
"scenario", "language", "retrieval_config", "error",
"judge_model", "embedding_model", "run_id", "difficulty",
"question_type", "doc_id", "doc_name", "section_path",
"page_start", "page_end", "source_chunk_ids", "review_status",
"review_notes", "weighted_score", "sample_weight",
}
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _sanitize_session_id(session_id: str) -> str:
"""Convert an arbitrary session_id string to a safe directory-name fragment."""
return re.sub(r"[^a-zA-Z0-9]", "-", session_id)[:64].strip("-") or "default"
class SessionScoreJobManager:
"""Thread-pool manager for session-grouped async scoring jobs.
All calls sharing a session_id append to one shared run directory, so the
report detail page shows all samples and their aggregate metrics together.
"""
def __init__(
self,
output_dir: Path = _DEFAULT_OUTPUT_DIR,
index_dir: Path = _DEFAULT_INDEX_DIR,
max_workers: int = 4,
) -> None:
self._output_dir = Path(output_dir)
self._index_dir = Path(index_dir)
self._output_dir.mkdir(parents=True, exist_ok=True)
self._index_dir.mkdir(parents=True, exist_ok=True)
(self._index_dir / "_sessions").mkdir(parents=True, exist_ok=True)
self._executor = ThreadPoolExecutor(max_workers=max_workers)
# job_id -> AsyncScoreJobStatus; guarded by _lock
self._job_cache: dict[str, AsyncScoreJobStatus] = {}
# session_id -> [job_ids in order]; guarded by _lock
self._session_jobs: dict[str, list[str]] = {}
# session_id -> per-session threading.Lock; guarded by _lock
self._session_locks: dict[str, threading.Lock] = {}
self._lock = threading.Lock()
self._load_existing()
# ------------------------------------------------------------------ #
# Public API
# ------------------------------------------------------------------ #
def session_run_id(self, session_id: str) -> str:
"""Return the deterministic run_id for a session (also the dir name)."""
return f"session-{_sanitize_session_id(session_id)}"
def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
"""Queue one scoring call for a session.
Returns (job_status, run_id). run_id is deterministic from session_id.
"""
run_id = self.session_run_id(session_id)
job_id = uuid.uuid4().hex[:12]
status = AsyncScoreJobStatus(
job_id=job_id,
status="queued",
created_at=_now_iso(),
request_summary={
"question": (request.question or "")[:80],
"answer": (request.answer or "")[:80],
"metrics": list(request.metrics),
"judge_model": request.judge_model or "",
"embedding_model": request.embedding_model or "",
"has_contexts": bool(request.contexts),
"has_ground_truth": bool(request.ground_truth),
"session_id": session_id,
},
)
with self._lock:
self._job_cache[job_id] = status
if session_id not in self._session_jobs:
self._session_jobs[session_id] = []
self._session_jobs[session_id].append(job_id)
self._persist_job_index(status)
self._persist_session_index(session_id)
self._executor.submit(self._run, job_id, session_id, run_id, request)
return status, run_id
def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
"""Return current status of one call, or None if unknown."""
with self._lock:
return self._job_cache.get(job_id)
def list_jobs(self) -> list[AsyncScoreJobStatus]:
"""Return all session job records, newest first."""
with self._lock:
jobs = list(self._job_cache.values())
jobs.sort(key=lambda j: j.created_at, reverse=True)
return jobs
def get_session(self, session_id: str) -> SessionStatus | None:
"""Return aggregate status for a session, or None if unknown."""
with self._lock:
job_ids = list(self._session_jobs.get(session_id) or [])
if not job_ids:
return None
run_id = self.session_run_id(session_id)
run_dir = self._output_dir / run_id
# Compute live metric means from the CSV (may be mid-update — best effort)
metric_means = self._read_metric_means(run_dir)
with self._lock:
jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]
latest = max((j.finished_at for j in jobs if j.finished_at), default="")
return SessionStatus(
session_id=session_id,
run_id=run_id,
call_count=len(job_ids),
metric_means=metric_means,
latest_finished_at=latest,
jobs=sorted(jobs, key=lambda j: j.created_at),
)
def list_sessions(self) -> list[SessionStatus]:
"""Return aggregate status for all known sessions."""
with self._lock:
session_ids = list(self._session_jobs.keys())
results = []
for sid in session_ids:
status = self.get_session(sid)
if status is not None:
results.append(status)
results.sort(key=lambda s: s.latest_finished_at, reverse=True)
return results
# ------------------------------------------------------------------ #
# Worker
# ------------------------------------------------------------------ #
def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
"""Score one sample then append it to the session's shared run artifacts."""
import logging
logger = logging.getLogger("webapp.services.session_score_manager")
self._update_job(job_id, status="running")
# Lazy imports — keep web server bootable if ragas is not installed.
from rag_eval.advisor import run_advisor
from rag_eval.metrics.factory import build_models
from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.reporting.writers import write_run_artifacts
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import (
DatasetConfig, EvaluationResult, NormalizedSample,
RuntimeConfig, Scenario,
)
from rag_eval.shared.utils import utc_now_iso
from webapp.services.inline_scorer import inline_scorer
settings = EvaluationSettings()
judge_model = request.judge_model or settings.ragas_judge_model
embedding_model = request.embedding_model or settings.ragas_embedding_model
effective = request.effective_metrics()
requested = set(request.metrics)
skipped = sorted(requested - set(effective))
t0 = time.monotonic()
try:
# --- Scoring (can run concurrently for the same session) ----------
if effective:
raw_scores = inline_scorer.score(
question=request.question,
answer=request.answer,
contexts=request.contexts_as_list(),
ground_truth=request.ground_truth,
metrics=effective,
judge_model=judge_model,
embedding_model=embedding_model,
settings=settings,
)
else:
raw_scores = {}
latency_ms = int((time.monotonic() - t0) * 1000)
finished_at = utc_now_iso()
# Build complete scores for this sample (skipped metrics → None)
all_scores: dict[str, float | None] = {m: None for m in request.metrics}
all_scores.update(raw_scores)
# 综合加权得分计算(已暂时禁用)
# weighted_raw = compute_weighted_score(
# {k: v for k, v in raw_scores.items() if v is not None}, {}
# )
# weighted = round(weighted_raw, 4) if weighted_raw is not None else None
weighted = None
# --- File I/O must be serialized per session ----------------------
session_lock = self._get_session_lock(session_id)
with session_lock:
run_dir = self._output_dir / run_id
run_dir.mkdir(parents=True, exist_ok=True)
# Read all existing rows, then append the new one
existing_rows = self._read_score_rows(run_dir)
call_number = len(existing_rows) + 1
new_row: dict[str, Any] = {
"sample_id": f"session-score-{call_number}",
"question": request.question,
"answer": request.answer or "",
"contexts": request.contexts or "",
"ground_truth": request.ground_truth or "",
"error": "",
}
new_row.update(all_scores)
all_rows = existing_rows + [new_row]
# Reconstruct NormalizedSample objects for write_run_artifacts metadata
valid_samples = [
NormalizedSample(
sample_id=str(row.get("sample_id", f"session-score-{i + 1}")),
question=str(row.get("question", "")),
answer=str(row.get("answer", "")),
contexts=[
part.strip()
for part in str(row.get("contexts", "")).split(" |||| ")
if part.strip()
],
ground_truth=str(row.get("ground_truth", "")),
)
for i, row in enumerate(all_rows)
]
# Determine all metric columns (union of all rows' metric keys)
all_metric_names = sorted({
k for row in all_rows
for k in row if k not in _NON_METRIC_COLUMNS
})
scenario = Scenario(
scenario_name=f"session-{_sanitize_session_id(session_id)}",
mode="offline",
dataset=DatasetConfig(path=run_dir / "dataset.csv"),
judge_model=judge_model,
embedding_model=embedding_model,
metrics=all_metric_names,
output_dir=self._output_dir,
optimization_advisor=True,
)
started_at_val = (
existing_rows[0].get("_started_at", finished_at)
if existing_rows else finished_at
)
result = EvaluationResult(
scenario=scenario,
run_id=run_id,
started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
finished_at=finished_at,
valid_samples=valid_samples,
invalid_samples=[],
score_rows=all_rows,
)
write_run_artifacts(result)
logger.info(
"[session_job] artifacts written job_id=%s session_id=%s call=%d",
job_id, session_id, call_number,
)
# Regenerate optimization advice over all accumulated rows
try:
llm, _ = build_models(judge_model, embedding_model, settings)
run_advisor(result, scenario, llm)
logger.info("[session_job] advisor done job_id=%s session=%s", job_id, session_id)
except Exception as adv_exc: # noqa: BLE001
logger.warning(
"[session_job] advisor failed job_id=%s err=%s", job_id, adv_exc
)
self._update_job(
job_id,
status="completed",
finished_at=finished_at,
run_id=run_id,
scores=all_scores,
weighted_score=weighted,
latency_ms=latency_ms,
skipped_metrics=skipped,
)
self._persist_session_index(session_id)
except Exception as exc: # noqa: BLE001
latency_ms = int((time.monotonic() - t0) * 1000)
import logging as _logging
_logging.getLogger("webapp.services.session_score_manager").error(
"[session_job] failed job_id=%s err=%s", job_id, exc
)
self._update_job(
job_id,
status="failed",
finished_at=_now_iso(),
latency_ms=latency_ms,
error=f"{type(exc).__name__}: {exc}",
)
# ------------------------------------------------------------------ #
# Helpers
# ------------------------------------------------------------------ #
def _get_session_lock(self, session_id: str) -> threading.Lock:
with self._lock:
if session_id not in self._session_locks:
self._session_locks[session_id] = threading.Lock()
return self._session_locks[session_id]
def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
"""Read existing scores.csv rows, returning empty list if file doesn't exist."""
scores_path = run_dir / "scores.csv"
if not scores_path.is_file():
return []
try:
frame = pd.read_csv(scores_path)
return frame.where(pd.notnull(frame), None).to_dict("records")
except (OSError, ValueError):
return []
def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
"""Compute per-metric means from the session's scores.csv."""
scores_path = run_dir / "scores.csv"
if not scores_path.is_file():
return {}
try:
frame = pd.read_csv(scores_path)
except (OSError, ValueError):
return {}
means: dict[str, float | None] = {}
for col in frame.columns:
if col in _NON_METRIC_COLUMNS:
continue
if pd.api.types.is_numeric_dtype(frame[col]):
val = frame[col].mean(numeric_only=True)
means[col] = None if pd.isna(val) else round(float(val), 4)
return means
def _update_job(self, job_id: str, **kwargs: Any) -> None:
with self._lock:
existing = self._job_cache.get(job_id)
if existing is None:
return
updated = existing.model_copy(update=kwargs)
self._job_cache[job_id] = updated
self._persist_job_index(updated)
def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
"""Persist a single job's status to the index directory."""
path = self._index_dir / f"{status.job_id}.json"
path.write_text(
json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
encoding="utf-8",
)
def _persist_session_index(self, session_id: str) -> None:
"""Persist the session→job_ids mapping."""
with self._lock:
job_ids = list(self._session_jobs.get(session_id) or [])
run_id = self.session_run_id(session_id)
data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}
path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
path.write_text(
json.dumps(data, ensure_ascii=False, indent=2),
encoding="utf-8",
)
def _load_existing(self) -> None:
"""Restore job cache and session mappings from persisted index files on startup."""
# Load individual job files
for path in sorted(self._index_dir.glob("*.json")):
try:
data = json.loads(path.read_text(encoding="utf-8"))
status = AsyncScoreJobStatus.model_validate(data)
self._job_cache[status.job_id] = status
except Exception: # noqa: BLE001
pass
# Load session→job_ids mappings
sessions_dir = self._index_dir / "_sessions"
if not sessions_dir.is_dir():
return
for path in sorted(sessions_dir.glob("*.json")):
try:
data = json.loads(path.read_text(encoding="utf-8"))
sid = data.get("session_id", "")
job_ids = data.get("job_ids", [])
if sid:
self._session_jobs[sid] = job_ids
except Exception: # noqa: BLE001
pass
# Module-level singleton shared by FastAPI routes.
session_score_manager = SessionScoreJobManager()

View File

@@ -128,17 +128,17 @@ const Report = {
wrap.appendChild(card);
});
// 综合加权得分卡片
const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
const wsCard = document.createElement("div");
wsCard.className = "metric-card weighted-score-card";
const wsCls = App.scoreClass(wsValue);
const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
wsCard.innerHTML = `
<div class="metric-value ${wsCls}">${wsText}</div>
<div class="metric-name">综合加权得分</div>
`;
wrap.appendChild(wsCard);
// 综合加权得分卡片(已暂时隐藏)
// const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
// const wsCard = document.createElement("div");
// wsCard.className = "metric-card weighted-score-card";
// const wsCls = App.scoreClass(wsValue);
// const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
// wsCard.innerHTML = `
// <div class="metric-value ${wsCls}">${wsText}</div>
// <div class="metric-name">综合加权得分</div>
// `;
// wrap.appendChild(wsCard);
},
// ② 分数分布直方图(可切换指标)。

View File

@@ -55,10 +55,11 @@ const ScoreJobs = {
return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
})
.join(" ");
if (job.weighted_score !== null && job.weighted_score !== undefined) {
const cls = App.scoreClass(job.weighted_score);
scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
}
// 综合加权得分(已暂时隐藏)
// if (job.weighted_score !== null && job.weighted_score !== undefined) {
// const cls = App.scoreClass(job.weighted_score);
// scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
// }
} else if (job.status === "failed") {
scoreHtml = `<span style="color:var(--bad);font-size:12px">${App.escape((job.error || "").slice(0, 80))}</span>`;
} else {