From 36e5506e2acbfc305bc4d2800d5342e444cee001 Mon Sep 17 00:00:00 2001
From: wangwei
Date: Thu, 18 Jun 2026 17:16:09 +0800
Subject: [PATCH] feat: report_builder uses weighted means; ReportData gains weighted_score_mean

The snapshot weight reader tolerates snapshots that are not a mapping,
weight fields that are not mappings, and non-numeric (or boolean) weight
values: all of these degrade to empty/filtered weight maps instead of
raising while a report is being built.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
Reviewer note: this patch was re-flowed from a copy whose newlines and
indentation had been collapsed. Line structure was rebuilt from the hunk line
counts; indentation of context lines is a best-effort reconstruction, so
`git apply --ignore-whitespace` may be needed. The post-image blob ids on the
`index` lines of tests/test_webapp_report_builder.py and
webapp/services/run_reader.py predate the review edits to those files.

 tests/test_webapp_report_builder.py | 114 +++++++++++++++++++++++++++++
 webapp/services/report_builder.py   |  34 ++++++---
 webapp/services/run_reader.py       |  38 ++++++++++
 3 files changed, 174 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_webapp_report_builder.py

diff --git a/tests/test_webapp_report_builder.py b/tests/test_webapp_report_builder.py
new file mode 100644
index 0000000..1ed0a35
--- /dev/null
+++ b/tests/test_webapp_report_builder.py
@@ -0,0 +1,114 @@
+"""Regression tests for weighted webapp report aggregation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from webapp.services.report_builder import build_report
+from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
+
+
+def _write_run_artifacts(run_dir: Path) -> None:
+    """Create a minimal run directory with weighted scores and a snapshot."""
+    run_dir.mkdir(parents=True, exist_ok=True)
+    (run_dir / "scores.csv").write_text(
+        "\n".join(
+            [
+                "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
+                "s1,a.pdf,1.0,0.5,0.8333,3.0",
+                "s2,b.pdf,0.0,0.5,0.1667,1.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    (run_dir / "summary.md").write_text("summary", encoding="utf-8")
+    (run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")
+    (run_dir / "scenario.snapshot.yaml").write_text(
+        "\n".join(
+            [
+                "metrics:",
+                " - faithfulness",
+                " - context_recall",
+                "metric_weights:",
+                " faithfulness: 2.0",
+                " context_recall: 1.0",
+                "doc_weights:",
+                " a.pdf: 3.0",
+                " b.pdf: 1.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
+    """Snapshot weight reader returns both weight maps as plain float dicts."""
+    run_dir = tmp_path / "run"
+    _write_run_artifacts(run_dir)
+
+    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
+
+    assert metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
+    assert doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
+
+
+def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
+    """Report aggregation uses weighted means and surfaces snapshot weights."""
+    run_dir = tmp_path / "run"
+    _write_run_artifacts(run_dir)
+
+    report = build_report(run_dir, ["faithfulness", "context_recall"])
+
+    assert report.metric_means == {
+        "faithfulness": pytest.approx(0.75, rel=1e-4),
+        "context_recall": pytest.approx(0.5, rel=1e-4),
+    }
+    assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)
+    assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
+    assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
+    assert report.summary_markdown == "summary"
+    assert report.advice_markdown == "advice"
+
+
+def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
+    """Metric inference excludes weighted helper columns from scores.csv."""
+    run_dir = tmp_path / "run"
+    run_dir.mkdir(parents=True, exist_ok=True)
+    (run_dir / "scores.csv").write_text(
+        "\n".join(
+            [
+                "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
+                "s1,a.pdf,0.8,0.8,2.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
+
+
+def test_read_weights_from_snapshot_tolerates_malformed_snapshot(tmp_path: Path) -> None:
+    """Malformed snapshots degrade to empty or filtered weight maps instead of raising."""
+    run_dir = tmp_path / "run"
+    run_dir.mkdir(parents=True, exist_ok=True)
+    snapshot = run_dir / "scenario.snapshot.yaml"
+
+    snapshot.write_text("- not\n- a mapping\n", encoding="utf-8")
+    assert _read_weights_from_snapshot(run_dir) == ({}, {})
+
+    snapshot.write_text(
+        "\n".join(
+            [
+                "metric_weights:",
+                " - 1.0",
+                "doc_weights:",
+                " a.pdf: 2",
+                " b.pdf: true",
+                " c.pdf: heavy",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    assert _read_weights_from_snapshot(run_dir) == ({}, {"a.pdf": 2.0})
diff --git a/webapp/services/report_builder.py b/webapp/services/report_builder.py
index 5596315..1082401 100644
--- a/webapp/services/report_builder.py
+++ b/webapp/services/report_builder.py
@@ -13,6 +13,11 @@ from pathlib import Path
 
 import pandas as pd
 
+from rag_eval.metrics.weights import (
+    compute_overall_weighted_score_mean,
+    weighted_metric_means as _weighted_metric_means,
+)
+from webapp.services.run_reader import _read_weights_from_snapshot
 from webapp.services.text_utils import parse_contexts
 from webapp.models import (
     DistributionBin,
@@ -42,17 +47,6 @@ def _round_or_none(value: float | None) -> float | None:
     return round(float(value), 4)
 
 
-def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
-    """Compute the mean of each metric column across all scored samples."""
-    means: dict[str, float | None] = {}
-    for metric in metrics:
-        if metric in frame.columns:
-            means[metric] = _round_or_none(frame[metric].mean(numeric_only=True))
-        else:
-            means[metric] = None
-    return means
-
-
 def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
     """Bucket one metric's scores into fixed-width [0,1] histogram bins."""
     bins: list[DistributionBin] = []
@@ -165,6 +159,7 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
     frame = run_reader.read_scores_frame(run_dir)
     summary_markdown = run_reader.read_summary_markdown(run_dir)
     advice_markdown = run_reader.read_advice_markdown(run_dir)
+    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
 
     if frame.empty or not metrics:
         return ReportData(
@@ -172,8 +167,20 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
             metric_means={metric: None for metric in metrics},
             summary_markdown=summary_markdown,
             advice_markdown=advice_markdown,
+            metric_weights=metric_weights,
+            doc_weights=doc_weights,
         )
 
+    score_rows_list = frame.to_dict(orient="records")
+
+    # Use weighted metric means (degrades to arithmetic mean when weights are empty).
+    w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
+    rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
+
+    overall_ws = compute_overall_weighted_score_mean(
+        score_rows_list, metric_weights, doc_weights
+    )
+
     distributions = {
         metric: _distribution(frame, metric)
         for metric in metrics
@@ -182,10 +189,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
 
     return ReportData(
         metrics=metrics,
-        metric_means=_metric_means(frame, metrics),
+        metric_means=rounded_means,
         distributions=distributions,
         groupings=_groupings(frame, metrics),
         lowest_samples=_lowest_samples(frame, metrics),
         summary_markdown=summary_markdown,
         advice_markdown=advice_markdown,
+        weighted_score_mean=_round_or_none(overall_ws),
+        metric_weights=metric_weights,
+        doc_weights=doc_weights,
     )
diff --git a/webapp/services/run_reader.py b/webapp/services/run_reader.py
index ecec126..a8c9893 100644
--- a/webapp/services/run_reader.py
+++ b/webapp/services/run_reader.py
@@ -64,6 +64,42 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
     return []
 
 
+def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
+    """Read metric_weights and doc_weights from a scenario snapshot if present.
+
+    Returns a (metric_weights, doc_weights) tuple of plain dicts.
+    Both default to empty dicts when the snapshot is absent, unreadable, or not
+    a mapping. A weight field that is not a mapping yields an empty dict, and
+    entries whose value is not a real number (bools included) are skipped.
+    """
+    snapshot = run_dir / "scenario.snapshot.yaml"
+    if not snapshot.is_file():
+        return {}, {}
+    try:
+        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8"))
+    except (OSError, UnicodeDecodeError, yaml.YAMLError):
+        return {}, {}
+    if not isinstance(payload, dict):
+        # Empty files load as None; lists/scalars carry no weight fields either.
+        return {}, {}
+
+    def _numeric_weights(raw: object) -> dict[str, float]:
+        """Coerce one weight field to ``{name: float}``, dropping unusable entries."""
+        if not isinstance(raw, dict):
+            return {}
+        return {
+            str(name): float(value)
+            for name, value in raw.items()
+            # bool is an int subclass; ``true``/``false`` is not a weight.
+            if isinstance(value, (int, float)) and not isinstance(value, bool)
+        }
+
+    return (
+        _numeric_weights(payload.get("metric_weights")),
+        _numeric_weights(payload.get("doc_weights")),
+    )
+
+
 def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
     """Find every run directory (one that contains metadata.json) under the roots."""
     run_dirs: list[Path] = []
@@ -159,6 +195,8 @@ NON_METRIC_COLUMNS = {
     "source_chunk_ids",
     "review_status",
     "review_notes",
+    "weighted_score",
+    "sample_weight",
 }
 
 