From 480f6d66ea42e74b708b0e96c908f53e6c9eabbf Mon Sep 17 00:00:00 2001 From: wangwei Date: Thu, 18 Jun 2026 16:59:56 +0800 Subject: [PATCH] feat: use weighted metric means and add weighted_score row to summary.md Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- rag_eval/reporting/summary.py | 59 +++++++++++++++++--------- tests/test_offline_eval.py | 78 +++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 19 deletions(-) diff --git a/rag_eval/reporting/summary.py b/rag_eval/reporting/summary.py index c56528d..13a76af 100644 --- a/rag_eval/reporting/summary.py +++ b/rag_eval/reporting/summary.py @@ -6,6 +6,10 @@ import math import pandas as pd +from rag_eval.metrics.weights import ( + compute_overall_weighted_score_mean, + weighted_metric_means, +) from rag_eval.shared.models import EvaluationResult @@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str: lines.append("No valid samples were scored.") return "\n".join(lines) + "\n" - for metric in result.scenario.metrics: - mean_value = scores[metric].mean(numeric_only=True) - if isinstance(mean_value, float) and not math.isnan(mean_value): - lines.append(f"- {metric}: `{mean_value:.4f}`") - else: - lines.append(f"- {metric}: `n/a`") - - # Keep the summary self-sufficient by including every scored sample and its errors. - detail_columns = ["sample_id", *result.scenario.metrics, "error"] - detail = scores[detail_columns] - lines.extend( - [ - "", - "## Per-sample Scores", - "", - "```text", - _table_from_frame(detail), - "```", - ] + score_rows_list = scores.to_dict(orient="records") + w_means = weighted_metric_means( + score_rows_list, result.scenario.metrics, result.scenario.doc_weights ) + + has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights) + + for metric in result.scenario.metrics: + mean_value = w_means.get(metric) + w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0 + weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else "" + if mean_value is not None and not math.isnan(mean_value): + lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}") + else: + lines.append(f"- {metric}: `n/a`{weight_note}") + + if has_weights: + overall_ws = compute_overall_weighted_score_mean( + score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights + ) + weight_suffix = " (加权)" + if overall_ws is not None and not math.isnan(overall_ws): + lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**") + else: + lines.append(f"- **weighted_score{weight_suffix}: `n/a`**") + + detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"] + existing_columns = [c for c in detail_columns if c in scores.columns] + detail = scores[existing_columns] + lines.extend([ + "", + "## Per-sample Scores", + "", + "```text", + _table_from_frame(detail), + "```", + ]) return "\n".join(lines) + "\n" diff --git a/tests/test_offline_eval.py b/tests/test_offline_eval.py index 3acc50c..3f6ce90 100644 --- a/tests/test_offline_eval.py +++ b/tests/test_offline_eval.py @@ -216,6 +216,84 @@ class EvaluatorAndReportingTests(unittest.TestCase): assert abs(row["weighted_score"] - 0.75) < 1e-4 assert row["sample_weight"] == 2.0 + def test_summary_markdown_shows_weighted_score(self): + """build_summary_markdown includes weighted_score when metric_weights set.""" + import math + from rag_eval.reporting.summary import build_summary_markdown + from rag_eval.shared.models import ( + EvaluationResult, NormalizedSample, DatasetConfig, Scenario, + ) + from pathlib import Path + scenario = Scenario( + scenario_name="ws-test", mode="offline", + dataset=DatasetConfig(path=Path("d.csv")), + judge_model="m", embedding_model="e", + metrics=["faithfulness"], + output_dir=Path("out"), + metric_weights={"faithfulness": 1.0}, + doc_weights={}, + ) + sample = NormalizedSample( + sample_id="s1", question="q", contexts=["c"], + answer="a", ground_truth="gt", + ) + result = EvaluationResult( + scenario=scenario, run_id="r1", + started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00", + valid_samples=[sample], invalid_samples=[], + score_rows=[{ + "sample_id": "s1", "faithfulness": 0.8, + "weighted_score": 0.8, "sample_weight": 1.0, + "doc_name": "", "error": "", + }], + ) + md = build_summary_markdown(result) + assert "weighted_score" in md + assert "0.8000" in md + + def test_summary_markdown_hides_weighted_score_without_weights(self): + """build_summary_markdown preserves unweighted summaries when no weights set.""" + from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario + + scenario = Scenario( + scenario_name="plain-test", + mode="offline", + dataset=DatasetConfig(path=Path("d.csv")), + judge_model="m", + embedding_model="e", + metrics=["faithfulness"], + output_dir=Path("out"), + metric_weights={}, + doc_weights={}, + ) + sample = NormalizedSample( + sample_id="s1", + question="q", + contexts=["c"], + answer="a", + ground_truth="gt", + ) + result = EvaluationResult( + scenario=scenario, + run_id="r1", + started_at="2026-01-01T00:00:00", + finished_at="2026-01-01T00:01:00", + valid_samples=[sample], + invalid_samples=[], + score_rows=[{ + "sample_id": "s1", + "faithfulness": 0.8, + "weighted_score": 0.8, + "sample_weight": 1.0, + "doc_name": "", + "error": "", + }], + ) + + md = build_summary_markdown(result) + + assert "- **weighted_score" not in md + def test_metric_pipeline_scores_sample(self) -> None: pipeline = MetricPipeline( metrics={