feat(session-async): add /api/score/session_async with incremental session report aggregation

- New POST /api/score/session_async endpoint: calls with the same session_id append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, and all job records
- New GET /api/score/session/jobs/{job_id}: returns the status of an individual call
- SessionScoreJobManager: deterministic run_id derived from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 (overall weighted score) display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00
parent e1751447df
commit 754a30ad59
36 changed files with 2004 additions and 51 deletions
--- a/tests/test_offline_eval.py
+++ b/tests/test_offline_eval.py
@@ -184,7 +184,7 @@ class ScenarioAndDatasetTests(unittest.TestCase):

 class EvaluatorAndReportingTests(unittest.TestCase):
    def test_merge_score_includes_weighted_score_and_sample_weight(self):
-        """_merge_score adds weighted_score and sample_weight columns."""
+        """_merge_score no longer adds weighted_score/sample_weight (feature disabled)."""
        from unittest.mock import MagicMock
        from rag_eval.execution.evaluator import Evaluator
        from rag_eval.shared.models import (
@@ -212,9 +212,11 @@ class EvaluatorAndReportingTests(unittest.TestCase):
        )
        score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
        row = evaluator._merge_score(sample, score)
-        # (3*1.0 + 1*0.0) / (3+1) = 0.75
-        assert abs(row["weighted_score"] - 0.75) < 1e-4
-        assert row["sample_weight"] == 2.0
+        # 综合加权得分已暂时禁用，weighted_score 和 sample_weight 不再写入
+        assert "weighted_score" not in row
+        assert "sample_weight" not in row
+        assert row["faithfulness"] == 1.0
+        assert row["context_recall"] == 0.0

    def test_summary_markdown_shows_weighted_score(self):
        """build_summary_markdown includes weighted_score when metric_weights set."""