feat(session-async): add /api/score/session_async with incremental session report aggregation

- New POST /api/score/session_async endpoint: same session_id calls append to one shared report - New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records - New GET /api/score/session/jobs/{job_id}: individual call status - SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call - SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added - 24 new tests, all passing chore(weighted-score): comment out 综合加权得分 display and computation - report.js: hide 综合加权得分 card in report detail page - score_jobs.js: hide 综合 chip in async job list - report_builder.py: overall_ws=None (computation disabled) - summary.py: weighted_score summary line disabled - evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv - score.py /api/score: weighted_score always returns null - score_job_manager.py + session_score_manager.py: weighted=None - Updated 3 tests to match new behaviour (6 pre-existing failures unchanged) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00
parent e1751447df
commit 754a30ad59
36 changed files with 2004 additions and 51 deletions
--- a/webapp/api/score.py
+++ b/webapp/api/score.py
@@ -156,10 +156,11 @@ def score_sample(
    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
    all_scores.update(raw_scores)

-    weighted = compute_weighted_score(
-        {key: value for key, value in raw_scores.items() if value is not None},
-        {},
-    )
+    # 综合加权得分计算（已暂时禁用）
+    # weighted = compute_weighted_score(
+    #     {key: value for key, value in raw_scores.items() if value is not None},
+    #     {},
+    # )

    logger.info(
        "[score] done  latency=%dms  skipped=%s  scores=%s",
@@ -169,7 +170,7 @@ def score_sample(
    )
    return ScoreResponse(
        scores=all_scores,
-        weighted_score=round(weighted, 4) if weighted is not None else None,
+        weighted_score=None,  # 综合加权得分已暂时禁用
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )
--- a/webapp/api/session_score_jobs.py
+++ b/webapp/api/session_score_jobs.py
@@ -0,0 +1,171 @@
+"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
+
+Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
+`POST /api/score/session_async` call with a shared `session_id`. All results are
+accumulated into one report, visible in 「运行列表」→「报告详情」.
+
+Key behaviour:
+  - Deterministic run_id: derived from session_id — same session always maps to the
+    same report directory (outputs/score-session/session-<id>/).
+  - Append semantics: each call adds a new sample row. Previous rows are preserved.
+  - Advisor regeneration: optimization_advice.md is regenerated after every call
+    using the full set of accumulated rows.
+  - Each call returns its own `job_id` for individual status polling, plus the
+    shared `run_id` and `session_id`.
+
+Endpoints:
+  POST /api/score/session_async         Submit one call (returns job_id + run_id)
+  GET  /api/score/sessions              List all sessions
+  GET  /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
+  GET  /api/score/session/jobs/{job_id} Status of one individual call
+"""
+
+from __future__ import annotations
+
+import logging
+
+from fastapi import APIRouter, HTTPException
+
+from webapp.models import (
+    AsyncScoreJobStatus,
+    ScoreRequest,
+    SessionScoreJobResponse,
+    SessionScoreRequest,
+    SessionStatus,
+)
+from webapp.services.session_score_manager import session_score_manager
+
+router = APIRouter(prefix="/api/score", tags=["score"])
+logger = logging.getLogger("webapp.api.session_score_jobs")
+
+
+@router.post(
+    "/session_async",
+    status_code=202,
+    response_model=SessionScoreJobResponse,
+    summary="提交 Session 异步评分（多样本批量聚合）",
+    responses={
+        202: {
+            "description": (
+                "调用已排队，立即返回 job_id + run_id（202 Accepted）。\n\n"
+                "相同 `session_id` 的多次调用合并为同一报告，每次调用新增一个样本行。\n"
+                "评分完成后，`summary.md` 和 `optimization_advice.md` 增量更新。\n"
+                "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态，"
+                "通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态，"
+                "在「运行列表」中查看完整报告（run_id 即 `session-<session_id>` 形式）。"
+            ),
+            "content": {
+                "application/json": {
+                    "example": {
+                        "job_id": "abc123def456",
+                        "session_id": "dify-session-001",
+                        "run_id": "session-dify-session-001",
+                        "status": "queued",
+                        "call_count": 1,
+                    }
+                }
+            },
+        },
+    },
+)
+def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
+    """提交 Session 异步 RAGAS 评分，立即返回 job_id。
+
+    相同 `session_id` 的多次调用合并到同一评估报告中，每次调用：
+    1. 新增一个样本行到 `scores.csv`
+    2. 重写 `summary.md`（包含所有累积样本的指标均值）
+    3. 重新生成 `optimization_advice.md`（基于全量样本的 LLM 优化建议）
+
+    **适合 Dify 工作流**：在循环节点中批量调用，所有轮次共用同一 `session_id`，
+    最终在 RAGAS 平台「运行列表」中查看完整的批量评估报告。
+    """
+    logger.info(
+        "[session_async] submit  session_id=%s  metrics=%s  has_ctx=%s  has_gt=%s",
+        request.session_id,
+        request.metrics,
+        bool(request.contexts),
+        bool(request.ground_truth),
+    )
+
+    # Strip session_id to build a plain ScoreRequest for the manager
+    score_request = ScoreRequest(
+        question=request.question,
+        answer=request.answer,
+        contexts=request.contexts,
+        ground_truth=request.ground_truth,
+        context_separator=request.context_separator,
+        metrics=request.metrics,
+        judge_model=request.judge_model,
+        embedding_model=request.embedding_model,
+    )
+
+    status, run_id = session_score_manager.submit(request.session_id, score_request)
+
+    # Compute call_count from current session state
+    session_status = session_score_manager.get_session(request.session_id)
+    call_count = session_status.call_count if session_status else 1
+
+    logger.info(
+        "[session_async] queued  job_id=%s  session_id=%s  run_id=%s  call=%d",
+        status.job_id, request.session_id, run_id, call_count,
+    )
+    return SessionScoreJobResponse(
+        job_id=status.job_id,
+        session_id=request.session_id,
+        run_id=run_id,
+        status=status.status,
+        call_count=call_count,
+    )
+
+
+@router.get(
+    "/sessions",
+    response_model=dict,
+    summary="列出所有 Session 聚合状态",
+)
+def list_sessions() -> dict:
+    """返回所有 session 的聚合状态，按最近完成时间倒序排列。"""
+    sessions = session_score_manager.list_sessions()
+    logger.info("[session_score] list_sessions  count=%d", len(sessions))
+    return {"sessions": [s.model_dump() for s in sessions]}
+
+
+@router.get(
+    "/sessions/{session_id}",
+    response_model=SessionStatus,
+    summary="查询 Session 聚合状态（指标均值 + 所有调用记录）",
+    responses={404: {"description": "指定 session_id 不存在。"}},
+)
+def get_session(session_id: str) -> SessionStatus:
+    """查询 session 的聚合评分状态。
+
+    返回内容：
+    - `run_id`：在「运行列表」中查看完整报告
+    - `call_count`：本 session 累计调用次数
+    - `metric_means`：所有已累积样本的各指标均值（实时读取 scores.csv）
+    - `jobs`：本 session 所有调用记录列表
+    """
+    status = session_score_manager.get_session(session_id)
+    if status is None:
+        raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
+    return status
+
+
+@router.get(
+    "/session/jobs/{job_id}",
+    response_model=AsyncScoreJobStatus,
+    summary="查询 Session 单次调用状态",
+    responses={404: {"description": "指定 job_id 不存在。"}},
+)
+def get_session_job(job_id: str) -> AsyncScoreJobStatus:
+    """查询 session 评分中某次调用的状态和评分结果。
+
+    `status` 为 `completed` 时，`run_id` 即所属 session 的报告目录，
+    `scores` 包含本次调用的各指标得分。
+    """
+    status = session_score_manager.get_job(job_id)
+    if status is None:
+        raise HTTPException(
+            status_code=404, detail=f"Session score job not found: {job_id}"
+        )
+    return status
--- a/webapp/models.py
+++ b/webapp/models.py
@@ -531,6 +531,50 @@ class AsyncScoreJobResponse(BaseModel):
    )


+# ---------------------------------------------------------------------------
+# Session async 评分模型
+# ---------------------------------------------------------------------------
+
+class SessionScoreRequest(ScoreRequest):
+    """Request body for session-grouped async scoring.
+
+    All calls sharing the same session_id are accumulated into one report.
+    Each call adds a new sample row to the session's scores.csv.
+    """
+
+    session_id: str = Field(
+        description=(
+            "会话唯一标识符。相同 session_id 的多次调用合并为同一报告，"
+            "每次调用新增一个样本行，指标均值和优化建议在每次调用后增量更新。"
+        ),
+    )
+
+
+class SessionScoreJobResponse(BaseModel):
+    """Immediate 202 response after submitting a session scoring call."""
+
+    job_id: str = Field(description="本次调用的任务唯一标识符。")
+    session_id: str = Field(description="会话标识符。")
+    run_id: str = Field(description="本 session 对应的报告 Run ID，可在「运行列表」中查看。")
+    status: str = Field(default="queued", description="初始状态：queued。")
+    call_count: int = Field(default=1, description="本 session 当前累计调用次数（包含本次）。")
+
+
+class SessionStatus(BaseModel):
+    """Aggregate status and metrics for a scoring session."""
+
+    session_id: str = Field(description="会话标识符。")
+    run_id: str = Field(description="对应报告目录的 Run ID。")
+    call_count: int = Field(description="本 session 累计调用次数。")
+    metric_means: dict[str, float | None] = Field(
+        default_factory=dict, description="所有已累积样本的各指标均值。"
+    )
+    latest_finished_at: str = Field(default="", description="最近一次评分完成时间（ISO 8601 UTC）。")
+    jobs: list[AsyncScoreJobStatus] = Field(
+        default_factory=list, description="本 session 所有调用记录，按创建时间排序。"
+    )
+
+
 class AsyncScoreJobStatus(BaseModel):
    """State of one async score job (queued → running → completed/failed)."""

--- a/webapp/server.py
+++ b/webapp/server.py
@@ -17,7 +17,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles

-from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs
+from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs, session_score_jobs

 STATIC_DIR = Path(__file__).resolve().parent / "static"
 logger = logging.getLogger("webapp.server")
@@ -73,6 +73,10 @@ OPENAPI_TAGS = [
            "**异步评分 API（Dify 推荐）** — `POST /api/score/async`\n\n"
            "异步方式立即返回 job_id（202），评分在后台执行，完成后自动生成完整报告（含优化建议），"
            "在「运行列表」页查看。\n\n"
+            "**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
+            "适合 Dify 循环节点批量评估：同一 `session_id` 的多次调用合并为一个报告，"
+            "每次调用新增一个样本行，指标均值和优化建议增量更新。\n"
+            "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n\n"
            "通过 `GET /api/score/jobs` 列出所有异步评分记录，"
            "`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
@@ -111,6 +115,7 @@ def create_app() -> FastAPI:
    app.include_router(pipeline.router)
    app.include_router(score.router)
    app.include_router(score_jobs.router)
+    app.include_router(session_score_jobs.router)

    @app.middleware("http")
    async def access_log_middleware(request: Request, call_next):
--- a/webapp/services/pipeline_task_manager.py
+++ b/webapp/services/pipeline_task_manager.py
@@ -0,0 +1,257 @@
+"""Background task manager for end-to-end pipeline jobs (build + eval).
+
+Each job runs three sequential phases inside a worker thread:
+  1. parsing_documents  — AliyunDocmind parses every PDF
+  2. generating_questions — LLM generates a draft question bank
+  3. evaluating          — RAGAS online evaluation scores each question
+
+The DatasetBuildJob and Scenario objects are constructed entirely from the
+API request parameters, so no YAML config files are needed.
+"""
+
+from __future__ import annotations
+
+import io
+import threading
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import redirect_stderr, redirect_stdout
+from datetime import datetime, timezone
+from pathlib import Path
+
+from webapp.models import (
+    PipelineJobRequest,
+    PipelineJobStatus,
+    PipelineResult,
+)
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+_PIPELINE_OUTPUT_ROOT = _REPO_ROOT / "outputs" / "pipeline"
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+class _LineCapture(io.TextIOBase):
+    """Write-only stream that appends complete lines to a task's log buffer."""
+
+    def __init__(self, sink: "PipelineTask") -> None:
+        self._sink = sink
+        self._buffer = ""
+
+    def write(self, text: str) -> int:
+        self._buffer += text
+        while "\n" in self._buffer:
+            line, self._buffer = self._buffer.split("\n", 1)
+            self._sink.append_log(line)
+        return len(text)
+
+    def flush(self) -> None:
+        if self._buffer:
+            self._sink.append_log(self._buffer)
+            self._buffer = ""
+
+
+class PipelineTask:
+    """Mutable state for one pipeline job (build + eval)."""
+
+    def __init__(self, job_id: str, job_name: str) -> None:
+        self.job_id = job_id
+        self.job_name = job_name
+        self.status = "queued"
+        self.phase = "idle"
+        self.logs: list[str] = []
+        self.result: PipelineResult | None = None
+        self.error: str | None = None
+        self.created_at = _now_iso()
+        self.finished_at = ""
+        self._lock = threading.Lock()
+
+    def append_log(self, line: str) -> None:
+        with self._lock:
+            self.logs.append(line)
+
+    def snapshot(self) -> PipelineJobStatus:
+        with self._lock:
+            return PipelineJobStatus(
+                job_id=self.job_id,
+                job_name=self.job_name,
+                status=self.status,
+                phase=self.phase,
+                logs=list(self.logs),
+                result=self.result,
+                error=self.error,
+                created_at=self.created_at,
+                finished_at=self.finished_at,
+            )
+
+
+class PipelineTaskManager:
+    """Owns the thread pool and registry of pipeline jobs."""
+
+    def __init__(self, max_workers: int = 2) -> None:
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tasks: dict[str, PipelineTask] = {}
+        self._lock = threading.Lock()
+
+    def submit(self, request: PipelineJobRequest) -> PipelineTask:
+        """Register and schedule a new pipeline job; return its task object."""
+        job_id = uuid.uuid4().hex[:12]
+        job_name = request.job_name.strip() or f"pipeline-{job_id[:6]}"
+        task = PipelineTask(job_id=job_id, job_name=job_name)
+        with self._lock:
+            self._tasks[job_id] = task
+        self._executor.submit(self._run, task, request)
+        return task
+
+    def get(self, job_id: str) -> PipelineJobStatus | None:
+        with self._lock:
+            task = self._tasks.get(job_id)
+        return task.snapshot() if task is not None else None
+
+    def list_jobs(self) -> list[PipelineJobStatus]:
+        with self._lock:
+            tasks = list(self._tasks.values())
+        snapshots = [t.snapshot() for t in tasks]
+        snapshots.sort(key=lambda s: s.created_at, reverse=True)
+        return snapshots
+
+    # ------------------------------------------------------------------ #
+    # Worker
+    # ------------------------------------------------------------------ #
+
+    def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:
+        """Execute the full pipeline end to end inside a worker thread."""
+        task.status = "running"
+        task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")
+
+        capture = _LineCapture(task)
+        try:
+            with redirect_stdout(capture), redirect_stderr(capture):
+                result = self._execute(task, request)
+            capture.flush()
+            task.result = result
+            task.phase = "done"
+            task.status = "completed"
+            task.append_log(f"[{_now_iso()}] pipeline 任务完成: {task.job_name}")
+        except Exception as exc:  # noqa: BLE001
+            capture.flush()
+            task.error = f"{type(exc).__name__}: {exc}"
+            task.append_log(f"[{_now_iso()}] pipeline 任务失败: {task.error}")
+            task.status = "failed"
+        finally:
+            task.finished_at = _now_iso()
+
+    def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:
+        """Run build then eval, updating task.phase as we go."""
+
+        # ── resolve paths ──────────────────────────────────────────────
+        docs_path = Path(req.docs_path)
+        if not docs_path.is_absolute():
+            docs_path = (_REPO_ROOT / docs_path).resolve()
+        if not docs_path.is_dir():
+            raise ValueError(f"docs_path is not an existing directory: {docs_path}")
+
+        job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id
+        build_artifact_dir = job_output_dir / "build"
+        dataset_csv = job_output_dir / "generated_dataset.csv"
+        eval_output_dir = job_output_dir / "eval"
+
+        # ── phase 1 + 2: dataset build (parse & generate) ─────────────
+        task.phase = "parsing_documents"
+        task.append_log(f"  [build] 扫描文档目录: {docs_path}")
+        build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)
+
+        source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"
+        total_q = len(build_result.draft_samples)
+        parse_failures = len(build_result.parse_failures)
+        task.append_log(f"  [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")
+
+        if total_q == 0:
+            raise RuntimeError("题库为空（所有文档均解析或生成失败），中止评估。")
+
+        # ── phase 3: evaluation ────────────────────────────────────────
+        task.phase = "evaluating"
+        task.append_log(f"  [eval] 开始 RAGAS 评估，共 {total_q} 道题目")
+        eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)
+
+        from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths
+        eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)
+
+        return PipelineResult(
+            build_artifact_dir=build_artifact_dir.as_posix(),
+            dataset_csv=dataset_csv.as_posix(),
+            source_chunks_jsonl=source_chunks_jsonl.as_posix(),
+            total_questions=total_q,
+            parse_failures=parse_failures,
+            eval_run_id=eval_result.run_id,
+            eval_output_dir=eval_result.scenario.output_dir.as_posix(),
+            scores_csv=eval_artifact_paths.scores_csv.as_posix(),
+            summary_md=eval_artifact_paths.summary_md.as_posix(),
+        )
+
+    def _run_build(self, task: PipelineTask, req: PipelineJobRequest,
+                   docs_path: Path, artifact_dir: Path, dataset_csv: Path):
+        """Construct DatasetBuildJob and run the build phase."""
+        from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
+        from rag_eval.dataset_builder.runner import execute_dataset_build_job
+        from rag_eval.settings import EvaluationSettings
+
+        settings = EvaluationSettings()
+        job = DatasetBuildJob(
+            job_name=task.job_name,
+            input_path=docs_path,
+            input_glob="*.pdf",
+            parser_provider="aliyun_docmind",
+            failure_mode=req.failure_mode,  # type: ignore[arg-type]
+            generation_model=req.generation_model,
+            output_type="online_question_bank",
+            review_mode="draft_with_manual_review",
+            max_questions_per_document=req.max_questions_per_document,
+            max_source_chunks_per_question=req.max_source_chunks_per_question,
+            dataset_path=dataset_csv,
+            artifact_dir=artifact_dir,
+            runtime=DatasetBuildRuntime(max_documents=req.max_documents),
+        )
+        return execute_dataset_build_job(job, settings=settings)
+
+    def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,
+                  dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):
+        """Construct Scenario and run the evaluation phase."""
+        from rag_eval.execution.runner import run_scenario_from_scenario_obj
+        from rag_eval.settings import EvaluationSettings
+        from rag_eval.shared.models import (
+            AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario,
+        )
+
+        settings = EvaluationSettings()
+        scenario = Scenario(
+            scenario_name=task.job_name,
+            mode="online",
+            dataset=DatasetConfig(path=dataset_csv),
+            judge_model=req.judge_model,
+            embedding_model=req.embedding_model,
+            metrics=list(req.metrics),
+            output_dir=eval_output_dir,
+            runtime=RuntimeConfig(
+                batch_size=4,
+                app_concurrency=2,
+                metric_concurrency=2,
+                max_samples=req.max_samples,
+            ),
+            app_adapter=AppAdapterConfig(
+                type="python",
+                callable="apps.siemens_pdf_qa.adapter:run",
+                static_kwargs={
+                    "source_chunks_path": source_chunks_jsonl,
+                    "model": req.answer_model,
+                },
+            ),
+            optimization_advisor=req.optimization_advisor,
+        )
+        return run_scenario_from_scenario_obj(scenario, settings=settings)
+
+
+# Module-level singleton shared by the FastAPI routes.
+pipeline_task_manager = PipelineTaskManager()
--- a/webapp/services/report_builder.py
+++ b/webapp/services/report_builder.py
@@ -177,9 +177,11 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
    rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}

-    overall_ws = compute_overall_weighted_score_mean(
-        score_rows_list, metric_weights, doc_weights
-    )
+    # 综合加权得分计算（已暂时禁用）
+    # overall_ws = compute_overall_weighted_score_mean(
+    #     score_rows_list, metric_weights, doc_weights
+    # )
+    overall_ws = None

    distributions = {
        metric: _distribution(frame, metric)
--- a/webapp/services/score_job_manager.py
+++ b/webapp/services/score_job_manager.py
@@ -149,10 +149,12 @@ class ScoreJobManager:
            # Build full scores dict (skipped = None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)
-            weighted_raw = compute_weighted_score(
-                {k: v for k, v in raw_scores.items() if v is not None}, {}
-            )
-            weighted = round(weighted_raw, 4) if weighted_raw is not None else None
+            # 综合加权得分计算（已暂时禁用）
+            # weighted_raw = compute_weighted_score(
+            #     {k: v for k, v in raw_scores.items() if v is not None}, {}
+            # )
+            # weighted = round(weighted_raw, 4) if weighted_raw is not None else None
+            weighted = None

            # Build a score row compatible with report_builder
            score_row: dict[str, Any] = {
--- a/webapp/services/session_score_manager.py
+++ b/webapp/services/session_score_manager.py
@@ -0,0 +1,452 @@
+"""Background task manager for session-grouped async RAGAS scoring.
+
+Each session groups multiple scoring calls into one shared run report:
+
+  1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
+  2. Every call: appends a new sample row to scores.csv, rewrites summary.md
+     and optimization_advice.md by re-running write_run_artifacts + run_advisor
+     over ALL accumulated rows.
+  3. The resulting run directory is picked up automatically by run_reader, so the
+     「运行列表」 and 「报告详情」 pages show the live, growing report.
+
+Concurrency model:
+  - Scoring (LLM network I/O) runs freely in the thread pool — different sessions
+    score concurrently; multiple calls to the same session also start scoring in
+    parallel.
+  - File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
+    a per-session threading.Lock, so no two calls corrupt the same session's CSV.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import threading
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
+_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"
+
+# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS)
+_NON_METRIC_COLUMNS = {
+    "sample_id", "question", "contexts", "answer", "ground_truth",
+    "scenario", "language", "retrieval_config", "error",
+    "judge_model", "embedding_model", "run_id", "difficulty",
+    "question_type", "doc_id", "doc_name", "section_path",
+    "page_start", "page_end", "source_chunk_ids", "review_status",
+    "review_notes", "weighted_score", "sample_weight",
+}
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _sanitize_session_id(session_id: str) -> str:
+    """Convert an arbitrary session_id string to a safe directory-name fragment."""
+    return re.sub(r"[^a-zA-Z0-9]", "-", session_id)[:64].strip("-") or "default"
+
+
+class SessionScoreJobManager:
+    """Thread-pool manager for session-grouped async scoring jobs.
+
+    All calls sharing a session_id append to one shared run directory, so the
+    report detail page shows all samples and their aggregate metrics together.
+    """
+
+    def __init__(
+        self,
+        output_dir: Path = _DEFAULT_OUTPUT_DIR,
+        index_dir: Path = _DEFAULT_INDEX_DIR,
+        max_workers: int = 4,
+    ) -> None:
+        self._output_dir = Path(output_dir)
+        self._index_dir = Path(index_dir)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+        self._index_dir.mkdir(parents=True, exist_ok=True)
+        (self._index_dir / "_sessions").mkdir(parents=True, exist_ok=True)
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+
+        # job_id -> AsyncScoreJobStatus; guarded by _lock
+        self._job_cache: dict[str, AsyncScoreJobStatus] = {}
+        # session_id -> [job_ids in order]; guarded by _lock
+        self._session_jobs: dict[str, list[str]] = {}
+        # session_id -> per-session threading.Lock; guarded by _lock
+        self._session_locks: dict[str, threading.Lock] = {}
+        self._lock = threading.Lock()
+
+        self._load_existing()
+
+    # ------------------------------------------------------------------ #
+    # Public API
+    # ------------------------------------------------------------------ #
+
+    def session_run_id(self, session_id: str) -> str:
+        """Return the deterministic run_id for a session (also the dir name)."""
+        return f"session-{_sanitize_session_id(session_id)}"
+
+    def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
+        """Queue one scoring call for a session.
+
+        Returns (job_status, run_id). run_id is deterministic from session_id.
+        """
+        run_id = self.session_run_id(session_id)
+        job_id = uuid.uuid4().hex[:12]
+
+        status = AsyncScoreJobStatus(
+            job_id=job_id,
+            status="queued",
+            created_at=_now_iso(),
+            request_summary={
+                "question": (request.question or "")[:80],
+                "answer": (request.answer or "")[:80],
+                "metrics": list(request.metrics),
+                "judge_model": request.judge_model or "",
+                "embedding_model": request.embedding_model or "",
+                "has_contexts": bool(request.contexts),
+                "has_ground_truth": bool(request.ground_truth),
+                "session_id": session_id,
+            },
+        )
+
+        with self._lock:
+            self._job_cache[job_id] = status
+            if session_id not in self._session_jobs:
+                self._session_jobs[session_id] = []
+            self._session_jobs[session_id].append(job_id)
+
+        self._persist_job_index(status)
+        self._persist_session_index(session_id)
+        self._executor.submit(self._run, job_id, session_id, run_id, request)
+        return status, run_id
+
+    def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
+        """Return current status of one call, or None if unknown."""
+        with self._lock:
+            return self._job_cache.get(job_id)
+
+    def list_jobs(self) -> list[AsyncScoreJobStatus]:
+        """Return all session job records, newest first."""
+        with self._lock:
+            jobs = list(self._job_cache.values())
+        jobs.sort(key=lambda j: j.created_at, reverse=True)
+        return jobs
+
+    def get_session(self, session_id: str) -> SessionStatus | None:
+        """Return aggregate status for a session, or None if unknown."""
+        with self._lock:
+            job_ids = list(self._session_jobs.get(session_id) or [])
+        if not job_ids:
+            return None
+
+        run_id = self.session_run_id(session_id)
+        run_dir = self._output_dir / run_id
+
+        # Compute live metric means from the CSV (may be mid-update — best effort)
+        metric_means = self._read_metric_means(run_dir)
+
+        with self._lock:
+            jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]
+
+        latest = max((j.finished_at for j in jobs if j.finished_at), default="")
+        return SessionStatus(
+            session_id=session_id,
+            run_id=run_id,
+            call_count=len(job_ids),
+            metric_means=metric_means,
+            latest_finished_at=latest,
+            jobs=sorted(jobs, key=lambda j: j.created_at),
+        )
+
+    def list_sessions(self) -> list[SessionStatus]:
+        """Return aggregate status for all known sessions."""
+        with self._lock:
+            session_ids = list(self._session_jobs.keys())
+        results = []
+        for sid in session_ids:
+            status = self.get_session(sid)
+            if status is not None:
+                results.append(status)
+        results.sort(key=lambda s: s.latest_finished_at, reverse=True)
+        return results
+
+    # ------------------------------------------------------------------ #
+    # Worker
+    # ------------------------------------------------------------------ #
+
+    def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
+        """Score one sample then append it to the session's shared run artifacts."""
+        import logging
+        logger = logging.getLogger("webapp.services.session_score_manager")
+        self._update_job(job_id, status="running")
+
+        # Lazy imports — keep web server bootable if ragas is not installed.
+        from rag_eval.advisor import run_advisor
+        from rag_eval.metrics.factory import build_models
+        from rag_eval.metrics.weights import compute_weighted_score
+        from rag_eval.reporting.writers import write_run_artifacts
+        from rag_eval.settings import EvaluationSettings
+        from rag_eval.shared.models import (
+            DatasetConfig, EvaluationResult, NormalizedSample,
+            RuntimeConfig, Scenario,
+        )
+        from rag_eval.shared.utils import utc_now_iso
+        from webapp.services.inline_scorer import inline_scorer
+
+        settings = EvaluationSettings()
+        judge_model = request.judge_model or settings.ragas_judge_model
+        embedding_model = request.embedding_model or settings.ragas_embedding_model
+        effective = request.effective_metrics()
+        requested = set(request.metrics)
+        skipped = sorted(requested - set(effective))
+
+        t0 = time.monotonic()
+
+        try:
+            # --- Scoring (can run concurrently for the same session) ----------
+            if effective:
+                raw_scores = inline_scorer.score(
+                    question=request.question,
+                    answer=request.answer,
+                    contexts=request.contexts_as_list(),
+                    ground_truth=request.ground_truth,
+                    metrics=effective,
+                    judge_model=judge_model,
+                    embedding_model=embedding_model,
+                    settings=settings,
+                )
+            else:
+                raw_scores = {}
+
+            latency_ms = int((time.monotonic() - t0) * 1000)
+            finished_at = utc_now_iso()
+
+            # Build complete scores for this sample (skipped metrics → None)
+            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
+            all_scores.update(raw_scores)
+
+            # 综合加权得分计算（已暂时禁用）
+            # weighted_raw = compute_weighted_score(
+            #     {k: v for k, v in raw_scores.items() if v is not None}, {}
+            # )
+            # weighted = round(weighted_raw, 4) if weighted_raw is not None else None
+            weighted = None
+
+            # --- File I/O must be serialized per session ----------------------
+            session_lock = self._get_session_lock(session_id)
+            with session_lock:
+                run_dir = self._output_dir / run_id
+                run_dir.mkdir(parents=True, exist_ok=True)
+
+                # Read all existing rows, then append the new one
+                existing_rows = self._read_score_rows(run_dir)
+                call_number = len(existing_rows) + 1
+
+                new_row: dict[str, Any] = {
+                    "sample_id": f"session-score-{call_number}",
+                    "question": request.question,
+                    "answer": request.answer or "",
+                    "contexts": request.contexts or "",
+                    "ground_truth": request.ground_truth or "",
+                    "error": "",
+                }
+                new_row.update(all_scores)
+
+                all_rows = existing_rows + [new_row]
+
+                # Reconstruct NormalizedSample objects for write_run_artifacts metadata
+                valid_samples = [
+                    NormalizedSample(
+                        sample_id=str(row.get("sample_id", f"session-score-{i + 1}")),
+                        question=str(row.get("question", "")),
+                        answer=str(row.get("answer", "")),
+                        contexts=[
+                            part.strip()
+                            for part in str(row.get("contexts", "")).split(" |||| ")
+                            if part.strip()
+                        ],
+                        ground_truth=str(row.get("ground_truth", "")),
+                    )
+                    for i, row in enumerate(all_rows)
+                ]
+
+                # Determine all metric columns (union of all rows' metric keys)
+                all_metric_names = sorted({
+                    k for row in all_rows
+                    for k in row if k not in _NON_METRIC_COLUMNS
+                })
+
+                scenario = Scenario(
+                    scenario_name=f"session-{_sanitize_session_id(session_id)}",
+                    mode="offline",
+                    dataset=DatasetConfig(path=run_dir / "dataset.csv"),
+                    judge_model=judge_model,
+                    embedding_model=embedding_model,
+                    metrics=all_metric_names,
+                    output_dir=self._output_dir,
+                    optimization_advisor=True,
+                )
+
+                started_at_val = (
+                    existing_rows[0].get("_started_at", finished_at)
+                    if existing_rows else finished_at
+                )
+
+                result = EvaluationResult(
+                    scenario=scenario,
+                    run_id=run_id,
+                    started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
+                    finished_at=finished_at,
+                    valid_samples=valid_samples,
+                    invalid_samples=[],
+                    score_rows=all_rows,
+                )
+
+                write_run_artifacts(result)
+                logger.info(
+                    "[session_job] artifacts written  job_id=%s  session_id=%s  call=%d",
+                    job_id, session_id, call_number,
+                )
+
+                # Regenerate optimization advice over all accumulated rows
+                try:
+                    llm, _ = build_models(judge_model, embedding_model, settings)
+                    run_advisor(result, scenario, llm)
+                    logger.info("[session_job] advisor done  job_id=%s  session=%s", job_id, session_id)
+                except Exception as adv_exc:  # noqa: BLE001
+                    logger.warning(
+                        "[session_job] advisor failed  job_id=%s  err=%s", job_id, adv_exc
+                    )
+
+            self._update_job(
+                job_id,
+                status="completed",
+                finished_at=finished_at,
+                run_id=run_id,
+                scores=all_scores,
+                weighted_score=weighted,
+                latency_ms=latency_ms,
+                skipped_metrics=skipped,
+            )
+            self._persist_session_index(session_id)
+
+        except Exception as exc:  # noqa: BLE001
+            latency_ms = int((time.monotonic() - t0) * 1000)
+            import logging as _logging
+            _logging.getLogger("webapp.services.session_score_manager").error(
+                "[session_job] failed  job_id=%s  err=%s", job_id, exc
+            )
+            self._update_job(
+                job_id,
+                status="failed",
+                finished_at=_now_iso(),
+                latency_ms=latency_ms,
+                error=f"{type(exc).__name__}: {exc}",
+            )
+
+    # ------------------------------------------------------------------ #
+    # Helpers
+    # ------------------------------------------------------------------ #
+
+    def _get_session_lock(self, session_id: str) -> threading.Lock:
+        with self._lock:
+            if session_id not in self._session_locks:
+                self._session_locks[session_id] = threading.Lock()
+            return self._session_locks[session_id]
+
+    def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
+        """Read existing scores.csv rows, returning empty list if file doesn't exist."""
+        scores_path = run_dir / "scores.csv"
+        if not scores_path.is_file():
+            return []
+        try:
+            frame = pd.read_csv(scores_path)
+            return frame.where(pd.notnull(frame), None).to_dict("records")
+        except (OSError, ValueError):
+            return []
+
+    def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
+        """Compute per-metric means from the session's scores.csv."""
+        scores_path = run_dir / "scores.csv"
+        if not scores_path.is_file():
+            return {}
+        try:
+            frame = pd.read_csv(scores_path)
+        except (OSError, ValueError):
+            return {}
+        means: dict[str, float | None] = {}
+        for col in frame.columns:
+            if col in _NON_METRIC_COLUMNS:
+                continue
+            if pd.api.types.is_numeric_dtype(frame[col]):
+                val = frame[col].mean(numeric_only=True)
+                means[col] = None if pd.isna(val) else round(float(val), 4)
+        return means
+
+    def _update_job(self, job_id: str, **kwargs: Any) -> None:
+        with self._lock:
+            existing = self._job_cache.get(job_id)
+            if existing is None:
+                return
+            updated = existing.model_copy(update=kwargs)
+            self._job_cache[job_id] = updated
+        self._persist_job_index(updated)
+
+    def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
+        """Persist a single job's status to the index directory."""
+        path = self._index_dir / f"{status.job_id}.json"
+        path.write_text(
+            json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    def _persist_session_index(self, session_id: str) -> None:
+        """Persist the session→job_ids mapping."""
+        with self._lock:
+            job_ids = list(self._session_jobs.get(session_id) or [])
+        run_id = self.session_run_id(session_id)
+        data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}
+        path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
+        path.write_text(
+            json.dumps(data, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    def _load_existing(self) -> None:
+        """Restore job cache and session mappings from persisted index files on startup."""
+        # Load individual job files
+        for path in sorted(self._index_dir.glob("*.json")):
+            try:
+                data = json.loads(path.read_text(encoding="utf-8"))
+                status = AsyncScoreJobStatus.model_validate(data)
+                self._job_cache[status.job_id] = status
+            except Exception:  # noqa: BLE001
+                pass
+
+        # Load session→job_ids mappings
+        sessions_dir = self._index_dir / "_sessions"
+        if not sessions_dir.is_dir():
+            return
+        for path in sorted(sessions_dir.glob("*.json")):
+            try:
+                data = json.loads(path.read_text(encoding="utf-8"))
+                sid = data.get("session_id", "")
+                job_ids = data.get("job_ids", [])
+                if sid:
+                    self._session_jobs[sid] = job_ids
+            except Exception:  # noqa: BLE001
+                pass
+
+
+# Module-level singleton shared by FastAPI routes.
+session_score_manager = SessionScoreJobManager()
--- a/webapp/static/js/report.js
+++ b/webapp/static/js/report.js
@@ -128,17 +128,17 @@ const Report = {
      wrap.appendChild(card);
    });

-    // 综合加权得分卡片
-    const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
-    const wsCard = document.createElement("div");
-    wsCard.className = "metric-card weighted-score-card";
-    const wsCls = App.scoreClass(wsValue);
-    const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
-    wsCard.innerHTML = `
-      <div class="metric-value ${wsCls}">${wsText}</div>
-      <div class="metric-name">综合加权得分</div>
-    `;
-    wrap.appendChild(wsCard);
+    // 综合加权得分卡片（已暂时隐藏）
+    // const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
+    // const wsCard = document.createElement("div");
+    // wsCard.className = "metric-card weighted-score-card";
+    // const wsCls = App.scoreClass(wsValue);
+    // const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
+    // wsCard.innerHTML = `
+    //   <div class="metric-value ${wsCls}">${wsText}</div>
+    //   <div class="metric-name">综合加权得分</div>
+    // `;
+    // wrap.appendChild(wsCard);
  },

  // ② 分数分布直方图（可切换指标）。
--- a/webapp/static/js/score_jobs.js
+++ b/webapp/static/js/score_jobs.js
@@ -55,10 +55,11 @@ const ScoreJobs = {
          return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
        })
        .join(" ");
-      if (job.weighted_score !== null && job.weighted_score !== undefined) {
-        const cls = App.scoreClass(job.weighted_score);
-        scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
-      }
+      // 综合加权得分（已暂时隐藏）
+      // if (job.weighted_score !== null && job.weighted_score !== undefined) {
+      //   const cls = App.scoreClass(job.weighted_score);
+      //   scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
+      // }
    } else if (job.status === "failed") {
      scoreHtml = `<span style="color:var(--bad);font-size:12px">${App.escape((job.error || "").slice(0, 80))}</span>`;
    } else {