feat(session-async): add /api/score/session_async with incremental session report aggregation

- New POST /api/score/session_async endpoint: calls with the same session_id append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00
parent e1751447df
commit 754a30ad59
36 changed files with 2004 additions and 51 deletions
--- a/rag_eval/execution/evaluator.py
+++ b/rag_eval/execution/evaluator.py
@@ -180,12 +180,12 @@ class Evaluator:
        record["judge_model"] = self.scenario.judge_model
        record["embedding_model"] = self.scenario.embedding_model
        record["run_id"] = self.scenario.scenario_name
-        # Weighted score columns — enable post-hoc weighted aggregation in reporting.
-        record["weighted_score"] = compute_weighted_score(
-            score.metrics, self.scenario.metric_weights
-        )
-        doc_name = str(sample.metadata.get("doc_name", "") or "")
-        record["sample_weight"] = resolve_weight(
-            self.scenario.doc_weights, doc_name, default=1.0
-        )
+        # 综合加权得分列（已暂时禁用）
+        # record["weighted_score"] = compute_weighted_score(
+        #     score.metrics, self.scenario.metric_weights
+        # )
+        # doc_name = str(sample.metadata.get("doc_name", "") or "")
+        # record["sample_weight"] = resolve_weight(
+        #     self.scenario.doc_weights, doc_name, default=1.0
+        # )
        return record
--- a/rag_eval/reporting/summary.py
+++ b/rag_eval/reporting/summary.py
@@ -75,15 +75,16 @@ def build_summary_markdown(result: EvaluationResult) -> str:
        else:
            lines.append(f"- {metric}: `n/a`{weight_note}")

-    if has_weights:
-        overall_ws = compute_overall_weighted_score_mean(
-            score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
-        )
-        weight_suffix = " (加权)"
-        if overall_ws is not None and not math.isnan(overall_ws):
-            lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
-        else:
-            lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
+    # 综合加权得分（已暂时禁用）
+    # if has_weights:
+    #     overall_ws = compute_overall_weighted_score_mean(
+    #         score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
+    #     )
+    #     weight_suffix = " (加权)"
+    #     if overall_ws is not None and not math.isnan(overall_ws):
+    #         lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
+    #     else:
+    #         lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")

    detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
    existing_columns = [c for c in detail_columns if c in scores.columns]
--- a/rag_eval/shared/profile_store.py
+++ b/rag_eval/shared/profile_store.py
@@ -0,0 +1,53 @@
+"""Lightweight read-only accessor for configs/llm_profiles.json.
+
+Kept in ``rag_eval`` (not ``webapp``) so the runner can look up per-model
+credentials without depending on the webapp layer.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+_PROFILES_PATH = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
+
+
+def find_by_model(model_name: str) -> dict[str, Any] | None:
+    """Return the first profile whose ``model`` field equals *model_name*.
+
+    Returns None, without raising, when the profiles file is missing, when no
+    profile matches, or when reading/parsing fails (a warning is logged).
+    """
+    if not _PROFILES_PATH.exists():
+        return None
+    try:
+        data = json.loads(_PROFILES_PATH.read_text(encoding="utf-8"))
+        for profile in data.get("profiles", []):
+            if profile.get("model") == model_name:
+                return profile
+    except Exception as exc:  # noqa: BLE001
+        logger.warning("[profile_store] failed to read %s: %s", _PROFILES_PATH, exc)
+    return None
+
+
+def profile_to_client_kwargs(
+    profile: dict[str, Any],
+    fallback_api_key: str | None,
+    fallback_timeout: float,
+) -> dict[str, Any]:
+    """Convert a profile dict into keyword arguments for ``openai.AsyncOpenAI``.
+
+    Truthy profile values override the fallbacks; a blank ``base_url`` is omitted.
+    """
+    kwargs: dict[str, Any] = {
+        "api_key": profile.get("api_key") or fallback_api_key or "",  # never None
+        "timeout": float(profile.get("timeout_seconds") or fallback_timeout),
+    }
+    base_url = (profile.get("base_url") or "").strip()
+    if base_url:
+        kwargs["base_url"] = base_url
+    return kwargs