feat: report_builder uses weighted means; ReportData gains weighted_score_mean

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 17:16:09 +08:00
parent 835614189e
commit 36e5506e2a
3 changed files with 134 additions and 12 deletions
--- a/webapp/services/report_builder.py
+++ b/webapp/services/report_builder.py
@@ -13,6 +13,11 @@ from pathlib import Path

 import pandas as pd

+from rag_eval.metrics.weights import (
+    compute_overall_weighted_score_mean,
+    weighted_metric_means as _weighted_metric_means,
+)
+from webapp.services.run_reader import _read_weights_from_snapshot
 from webapp.services.text_utils import parse_contexts
 from webapp.models import (
    DistributionBin,
@@ -42,17 +47,6 @@ def _round_or_none(value: float | None) -> float | None:
    return round(float(value), 4)


-def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
-    """Compute the mean of each metric column across all scored samples."""
-    means: dict[str, float | None] = {}
-    for metric in metrics:
-        if metric in frame.columns:
-            means[metric] = _round_or_none(frame[metric].mean(numeric_only=True))
-        else:
-            means[metric] = None
-    return means
-
-
 def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width [0,1] histogram bins."""
    bins: list[DistributionBin] = []
@@ -165,6 +159,7 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    frame = run_reader.read_scores_frame(run_dir)
    summary_markdown = run_reader.read_summary_markdown(run_dir)
    advice_markdown = run_reader.read_advice_markdown(run_dir)
+    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)

    if frame.empty or not metrics:
        return ReportData(
@@ -172,8 +167,20 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
            metric_means={metric: None for metric in metrics},
            summary_markdown=summary_markdown,
            advice_markdown=advice_markdown,
+            metric_weights=metric_weights,
+            doc_weights=doc_weights,
        )

+    score_rows_list = frame.to_dict(orient="records")
+
+    # Use weighted metric means (degrades to arithmetic mean when weights are empty).
+    w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
+    rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
+
+    overall_ws = compute_overall_weighted_score_mean(
+        score_rows_list, metric_weights, doc_weights
+    )
+
    distributions = {
        metric: _distribution(frame, metric)
        for metric in metrics
@@ -182,10 +189,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:

    return ReportData(
        metrics=metrics,
-        metric_means=_metric_means(frame, metrics),
+        metric_means=rounded_means,
        distributions=distributions,
        groupings=_groupings(frame, metrics),
        lowest_samples=_lowest_samples(frame, metrics),
        summary_markdown=summary_markdown,
        advice_markdown=advice_markdown,
+        weighted_score_mean=_round_or_none(overall_ws),
+        metric_weights=metric_weights,
+        doc_weights=doc_weights,
    )
--- a/webapp/services/run_reader.py
+++ b/webapp/services/run_reader.py
@@ -64,6 +64,27 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
    return []


+def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
+    """Read metric_weights and doc_weights from a run's scenario snapshot, if present.
+
+    Returns a (metric_weights, doc_weights) tuple of plain str -> float dicts. A dict is
+    empty when the snapshot is missing or unreadable, or when its field is not a mapping.
+    """
+    snapshot = run_dir / "scenario.snapshot.yaml"
+    try:
+        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8"))
+    except (OSError, UnicodeDecodeError, yaml.YAMLError):  # missing, undecodable, or bad YAML
+        return {}, {}
+    fields = payload if isinstance(payload, dict) else {}  # empty file or non-mapping root
+
+    def _as_weights(raw: object) -> dict[str, float]:
+        # Non-mapping values give no weights instead of raising; non-numeric entries are skipped.
+        pairs = raw.items() if isinstance(raw, dict) else ()
+        return {str(k): float(v) for k, v in pairs if isinstance(v, (int, float))}
+
+    return _as_weights(fields.get("metric_weights")), _as_weights(fields.get("doc_weights"))
+
+
 def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
    """Find every run directory (one that contains metadata.json) under the roots."""
    run_dirs: list[Path] = []
@@ -159,6 +180,8 @@ NON_METRIC_COLUMNS = {
    "source_chunk_ids",
    "review_status",
    "review_notes",
+    "weighted_score",
+    "sample_weight",
 }