feat: use weighted metric means and add weighted_score row to summary.md

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:59:56 +08:00
parent d371ef7d24
commit 480f6d66ea
2 changed files with 118 additions and 19 deletions
--- a/tests/test_offline_eval.py
+++ b/tests/test_offline_eval.py
@@ -216,6 +216,84 @@ class EvaluatorAndReportingTests(unittest.TestCase):
        assert abs(row["weighted_score"] - 0.75) < 1e-4
        assert row["sample_weight"] == 2.0

+    def test_summary_markdown_shows_weighted_score(self):
+        """build_summary_markdown includes weighted_score when metric_weights set."""
+        import math
+        from rag_eval.reporting.summary import build_summary_markdown
+        from rag_eval.shared.models import (
+            EvaluationResult, NormalizedSample, DatasetConfig, Scenario,
+        )
+        from pathlib import Path
+        scenario = Scenario(
+            scenario_name="ws-test", mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m", embedding_model="e",
+            metrics=["faithfulness"],
+            output_dir=Path("out"),
+            metric_weights={"faithfulness": 1.0},
+            doc_weights={},
+        )
+        sample = NormalizedSample(
+            sample_id="s1", question="q", contexts=["c"],
+            answer="a", ground_truth="gt",
+        )
+        result = EvaluationResult(
+            scenario=scenario, run_id="r1",
+            started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00",
+            valid_samples=[sample], invalid_samples=[],
+            score_rows=[{
+                "sample_id": "s1", "faithfulness": 0.8,
+                "weighted_score": 0.8, "sample_weight": 1.0,
+                "doc_name": "", "error": "",
+            }],
+        )
+        md = build_summary_markdown(result)
+        assert "weighted_score" in md
+        assert "0.8000" in md
+
+    def test_summary_markdown_hides_weighted_score_without_weights(self):
+        """build_summary_markdown preserves unweighted summaries when no weights set."""
+        from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario
+
+        scenario = Scenario(
+            scenario_name="plain-test",
+            mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m",
+            embedding_model="e",
+            metrics=["faithfulness"],
+            output_dir=Path("out"),
+            metric_weights={},
+            doc_weights={},
+        )
+        sample = NormalizedSample(
+            sample_id="s1",
+            question="q",
+            contexts=["c"],
+            answer="a",
+            ground_truth="gt",
+        )
+        result = EvaluationResult(
+            scenario=scenario,
+            run_id="r1",
+            started_at="2026-01-01T00:00:00",
+            finished_at="2026-01-01T00:01:00",
+            valid_samples=[sample],
+            invalid_samples=[],
+            score_rows=[{
+                "sample_id": "s1",
+                "faithfulness": 0.8,
+                "weighted_score": 0.8,
+                "sample_weight": 1.0,
+                "doc_name": "",
+                "error": "",
+            }],
+        )
+
+        md = build_summary_markdown(result)
+
+        assert "- **weighted_score" not in md
+
    def test_metric_pipeline_scores_sample(self) -> None:
        pipeline = MetricPipeline(
            metrics={