feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 17:16:09 +08:00
parent 835614189e
commit 36e5506e2a
3 changed files with 134 additions and 12 deletions
--- a/tests/test_webapp_report_builder.py
+++ b/tests/test_webapp_report_builder.py
@@ -0,0 +1,89 @@
+"""Regression tests for weighted webapp report aggregation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from webapp.services.report_builder import build_report
+from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
+
+
+def _write_run_artifacts(run_dir: Path) -> None:
+    """Write scores.csv, two markdown reports and a YAML weight snapshot into ``run_dir``."""
+    run_dir.mkdir(parents=True, exist_ok=True)  # idempotent: tolerates an existing directory
+    (run_dir / "scores.csv").write_text(
+        "\n".join(
+            [
+                "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
+                "s1,a.pdf,1.0,0.5,0.8333,3.0",  # 0.8333 ~= (2*1.0 + 1*0.5) / 3, i.e. metric weights 2:1
+                "s2,b.pdf,0.0,0.5,0.1667,1.0",  # 0.1667 ~= (2*0.0 + 1*0.5) / 3
+            ]
+        ),
+        encoding="utf-8",
+    )
+    (run_dir / "summary.md").write_text("summary", encoding="utf-8")  # placeholder body, asserted verbatim by the report test
+    (run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")  # placeholder body, asserted verbatim by the report test
+    (run_dir / "scenario.snapshot.yaml").write_text(
+        "\n".join(
+            [
+                "metrics:",
+                "  - faithfulness",
+                "  - context_recall",
+                "metric_weights:",
+                "  faithfulness: 2.0",
+                "  context_recall: 1.0",
+                "doc_weights:",  # same 3:1 ratio as the sample_weight column in scores.csv
+                "  a.pdf: 3.0",
+                "  b.pdf: 1.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
+    """The snapshot reader returns a (metric_weights, doc_weights) pair matching the YAML fixture."""
+    run_dir = tmp_path / "run"
+    _write_run_artifacts(run_dir)
+
+    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)  # unpacks as a 2-tuple
+
+    assert metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}  # mirrors the metric_weights mapping in the fixture
+    assert doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}  # mirrors the doc_weights mapping in the fixture
+
+
+def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
+    """build_report yields weighted metric means plus the snapshot weight maps and markdown texts."""
+    run_dir = tmp_path / "run"
+    _write_run_artifacts(run_dir)
+
+    report = build_report(run_dir, ["faithfulness", "context_recall"])
+
+    assert report.metric_means == {
+        "faithfulness": pytest.approx(0.75, rel=1e-4),  # 3:1 row weighting: (3*1.0 + 1*0.0) / 4; a plain mean would be 0.5
+        "context_recall": pytest.approx(0.5, rel=1e-4),  # both rows are 0.5, so weighting cannot change it
+    }
+    assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)  # (3*0.8333 + 1*0.1667) / 4 ~= 0.66665
+    assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
+    assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
+    assert report.summary_markdown == "summary"  # text written to summary.md by the fixture
+    assert report.advice_markdown == "advice"  # text written to optimization_advice.md by the fixture
+
+
+def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
+    """With only scores.csv present, inference returns just the ``faithfulness`` column."""
+    run_dir = tmp_path / "run"
+    run_dir.mkdir(parents=True, exist_ok=True)  # no snapshot file is written in this test
+    (run_dir / "scores.csv").write_text(
+        "\n".join(
+            [
+                "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
+                "s1,a.pdf,0.8,0.8,2.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]  # sample_id, doc_name, weighted_score, sample_weight are all dropped