feat: report_builder uses weighted means; ReportData gains weighted_score_mean
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
89
tests/test_webapp_report_builder.py
Normal file
89
tests/test_webapp_report_builder.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""Regression tests for weighted webapp report aggregation."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from webapp.services.report_builder import build_report
|
||||||
|
from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
def _write_run_artifacts(run_dir: Path) -> None:
    """Populate *run_dir* with the minimal artifacts the report tests read.

    Writes, in order: ``scores.csv`` (two samples carrying ``weighted_score``
    and ``sample_weight`` columns), ``summary.md``, ``optimization_advice.md``
    and ``scenario.snapshot.yaml`` (metrics plus metric/doc weight maps).
    """
    run_dir.mkdir(parents=True, exist_ok=True)

    score_lines = [
        "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
        "s1,a.pdf,1.0,0.5,0.8333,3.0",
        "s2,b.pdf,0.0,0.5,0.1667,1.0",
    ]
    snapshot_lines = [
        "metrics:",
        " - faithfulness",
        " - context_recall",
        "metric_weights:",
        " faithfulness: 2.0",
        " context_recall: 1.0",
        "doc_weights:",
        " a.pdf: 3.0",
        " b.pdf: 1.0",
    ]

    # Dict insertion order keeps the files written in the same sequence as before.
    artifacts = {
        "scores.csv": "\n".join(score_lines),
        "summary.md": "summary",
        "optimization_advice.md": "advice",
        "scenario.snapshot.yaml": "\n".join(snapshot_lines),
    }
    for file_name, content in artifacts.items():
        (run_dir / file_name).write_text(content, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """The snapshot reader yields both weight maps as plain float dicts."""
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)

    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}

    actual_metric_weights, actual_doc_weights = _read_weights_from_snapshot(artifacts_dir)

    assert actual_metric_weights == expected_metric_weights
    assert actual_doc_weights == expected_doc_weights
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """build_report aggregates with weights and surfaces the snapshot weight maps."""
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)
    requested_metrics = ["faithfulness", "context_recall"]

    result = build_report(artifacts_dir, requested_metrics)

    # Per-metric means are compared value-by-value against approximate targets.
    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }
    assert result.metric_means == expected_means
    assert result.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)

    # Weight maps come straight from scenario.snapshot.yaml.
    assert result.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
    assert result.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}

    # Markdown artifacts are passed through unchanged.
    assert result.summary_markdown == "summary"
    assert result.advice_markdown == "advice"
|
||||||
|
|
||||||
|
|
||||||
|
def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """With no snapshot present, inferred metrics omit the weighted helper columns."""
    artifacts_dir = tmp_path / "run"
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    csv_rows = [
        "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
        "s1,a.pdf,0.8,0.8,2.0",
    ]
    scores_path = artifacts_dir / "scores.csv"
    scores_path.write_text("\n".join(csv_rows), encoding="utf-8")

    inferred = _infer_metrics_from_scores(artifacts_dir)

    assert inferred == ["faithfulness"]
|
||||||
@@ -13,6 +13,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
weighted_metric_means as _weighted_metric_means,
|
||||||
|
)
|
||||||
|
from webapp.services.run_reader import _read_weights_from_snapshot
|
||||||
from webapp.services.text_utils import parse_contexts
|
from webapp.services.text_utils import parse_contexts
|
||||||
from webapp.models import (
|
from webapp.models import (
|
||||||
DistributionBin,
|
DistributionBin,
|
||||||
@@ -42,17 +47,6 @@ def _round_or_none(value: float | None) -> float | None:
|
|||||||
return round(float(value), 4)
|
return round(float(value), 4)
|
||||||
|
|
||||||
|
|
||||||
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
    """Return each requested metric's rounded column mean over the scored samples.

    A metric whose column is not present in *frame* maps to ``None``.
    """
    available = frame.columns
    return {
        name: (
            _round_or_none(frame[name].mean(numeric_only=True))
            if name in available
            else None
        )
        for name in metrics
    }
|
|
||||||
|
|
||||||
|
|
||||||
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
|
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
|
||||||
"""Bucket one metric's scores into fixed-width [0,1] histogram bins."""
|
"""Bucket one metric's scores into fixed-width [0,1] histogram bins."""
|
||||||
bins: list[DistributionBin] = []
|
bins: list[DistributionBin] = []
|
||||||
@@ -165,6 +159,7 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
frame = run_reader.read_scores_frame(run_dir)
|
frame = run_reader.read_scores_frame(run_dir)
|
||||||
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
||||||
advice_markdown = run_reader.read_advice_markdown(run_dir)
|
advice_markdown = run_reader.read_advice_markdown(run_dir)
|
||||||
|
metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
|
||||||
|
|
||||||
if frame.empty or not metrics:
|
if frame.empty or not metrics:
|
||||||
return ReportData(
|
return ReportData(
|
||||||
@@ -172,6 +167,18 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
metric_means={metric: None for metric in metrics},
|
metric_means={metric: None for metric in metrics},
|
||||||
summary_markdown=summary_markdown,
|
summary_markdown=summary_markdown,
|
||||||
advice_markdown=advice_markdown,
|
advice_markdown=advice_markdown,
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
|
)
|
||||||
|
|
||||||
|
score_rows_list = frame.to_dict(orient="records")
|
||||||
|
|
||||||
|
# Use weighted metric means (degrades to arithmetic mean when weights are empty).
|
||||||
|
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
|
||||||
|
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
|
||||||
|
|
||||||
|
overall_ws = compute_overall_weighted_score_mean(
|
||||||
|
score_rows_list, metric_weights, doc_weights
|
||||||
)
|
)
|
||||||
|
|
||||||
distributions = {
|
distributions = {
|
||||||
@@ -182,10 +189,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
|
|
||||||
return ReportData(
|
return ReportData(
|
||||||
metrics=metrics,
|
metrics=metrics,
|
||||||
metric_means=_metric_means(frame, metrics),
|
metric_means=rounded_means,
|
||||||
distributions=distributions,
|
distributions=distributions,
|
||||||
groupings=_groupings(frame, metrics),
|
groupings=_groupings(frame, metrics),
|
||||||
lowest_samples=_lowest_samples(frame, metrics),
|
lowest_samples=_lowest_samples(frame, metrics),
|
||||||
summary_markdown=summary_markdown,
|
summary_markdown=summary_markdown,
|
||||||
advice_markdown=advice_markdown,
|
advice_markdown=advice_markdown,
|
||||||
|
weighted_score_mean=_round_or_none(overall_ws),
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,6 +64,27 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
    """Read metric_weights and doc_weights from a scenario snapshot if present.

    Args:
        run_dir: Run directory that may contain ``scenario.snapshot.yaml``.

    Returns:
        A ``(metric_weights, doc_weights)`` tuple of plain ``str -> float`` dicts.
        Both are empty when the snapshot is absent, cannot be read or decoded
        as UTF-8, is not valid YAML, or does not hold a mapping at the top
        level. A field that is missing or is not itself a mapping yields an
        empty dict; entries whose value is not an int/float are dropped.
    """
    snapshot = run_dir / "scenario.snapshot.yaml"
    if not snapshot.is_file():
        return {}, {}
    try:
        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly for a badly-encoded snapshot to degrade gracefully.
        return {}, {}
    if not isinstance(payload, dict):
        # A list/scalar/empty document has no weight fields to offer.
        return {}, {}

    def _as_weight_map(raw: object) -> dict[str, float]:
        """Coerce one snapshot field to ``str -> float``, tolerating bad shapes."""
        if not isinstance(raw, dict):
            return {}
        return {
            str(key): float(value)
            for key, value in raw.items()
            if isinstance(value, (int, float))
        }

    return (
        _as_weight_map(payload.get("metric_weights")),
        _as_weight_map(payload.get("doc_weights")),
    )
|
||||||
|
|
||||||
|
|
||||||
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
|
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
|
||||||
"""Find every run directory (one that contains metadata.json) under the roots."""
|
"""Find every run directory (one that contains metadata.json) under the roots."""
|
||||||
run_dirs: list[Path] = []
|
run_dirs: list[Path] = []
|
||||||
@@ -159,6 +180,8 @@ NON_METRIC_COLUMNS = {
|
|||||||
"source_chunk_ids",
|
"source_chunk_ids",
|
||||||
"review_status",
|
"review_status",
|
||||||
"review_notes",
|
"review_notes",
|
||||||
|
"weighted_score",
|
||||||
|
"sample_weight",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user