From d371ef7d248c826f732d976d9460995fcf8d5656 Mon Sep 17 00:00:00 2001
From: wangwei
Date: Thu, 18 Jun 2026 16:53:45 +0800
Subject: [PATCH] feat: add weighted_score and sample_weight columns to score rows

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 rag_eval/execution/evaluator.py | 11 ++++++++++-
 tests/test_offline_eval.py      | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/rag_eval/execution/evaluator.py b/rag_eval/execution/evaluator.py
index a69454f..b2a84cc 100644
--- a/rag_eval/execution/evaluator.py
+++ b/rag_eval/execution/evaluator.py
@@ -12,6 +12,7 @@ from rag_eval.datasets.loader import load_dataset_records
 from rag_eval.datasets.normalizers import normalize_records
 from rag_eval.execution.concurrency import gather_with_limit
 from rag_eval.metrics.pipeline import MetricPipeline
+from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
 from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
 from rag_eval.shared.utils import utc_now_iso
 
@@ -171,7 +172,7 @@ class Evaluator:
         return valid, invalid
 
     def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
-        """Combine sample data, metric results, and run metadata into one output row."""
+        """Combine sample data, metric results, run metadata, and weight columns."""
         record = sample.to_record()
         record["contexts"] = sample.contexts
         record.update(score.metrics)
@@ -179,4 +180,12 @@ class Evaluator:
         record["judge_model"] = self.scenario.judge_model
         record["embedding_model"] = self.scenario.embedding_model
         record["run_id"] = self.scenario.scenario_name
+        # Weighted score columns — enable post-hoc weighted aggregation in reporting.
+        record["weighted_score"] = compute_weighted_score(
+            score.metrics, self.scenario.metric_weights
+        )
+        doc_name = str(sample.metadata.get("doc_name", "") or "")
+        record["sample_weight"] = resolve_weight(
+            self.scenario.doc_weights, doc_name, default=1.0
+        )
         return record
diff --git a/tests/test_offline_eval.py b/tests/test_offline_eval.py
index 384b665..3acc50c 100644
--- a/tests/test_offline_eval.py
+++ b/tests/test_offline_eval.py
@@ -183,6 +183,39 @@ class ScenarioAndDatasetTests(unittest.TestCase):
 
 
 class EvaluatorAndReportingTests(unittest.TestCase):
+    def test_merge_score_includes_weighted_score_and_sample_weight(self) -> None:
+        """_merge_score adds weighted_score and sample_weight columns."""
+        from unittest.mock import MagicMock
+        from rag_eval.execution.evaluator import Evaluator
+        from rag_eval.shared.models import (
+            MetricScore, NormalizedSample, Scenario, DatasetConfig,
+        )
+
+        scenario = Scenario(
+            scenario_name="w-test", mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m", embedding_model="e",
+            metrics=["faithfulness", "context_recall"],
+            output_dir=Path("out"),
+            metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
+            doc_weights={"doc.pdf": 2.0},
+        )
+        evaluator = Evaluator(
+            scenario=scenario,
+            metric_pipeline=MagicMock(),
+            app_adapter=None,
+        )
+        sample = NormalizedSample(
+            sample_id="s1", question="q", contexts=["ctx"],
+            answer="a", ground_truth="gt",
+            metadata={"doc_name": "doc.pdf"},
+        )
+        score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
+        row = evaluator._merge_score(sample, score)
+        # Expected value asserted below: (3*1.0 + 1*0.0) / (3+1) = 0.75
+        self.assertAlmostEqual(row["weighted_score"], 0.75, delta=1e-4)
+        self.assertEqual(row["sample_weight"], 2.0)
+
     def test_metric_pipeline_scores_sample(self) -> None:
         pipeline = MetricPipeline(
             metrics={