From d371ef7d248c826f732d976d9460995fcf8d5656 Mon Sep 17 00:00:00 2001
From: wangwei <Wei.Wang@t-systems.com>
Date: Thu, 18 Jun 2026 16:53:45 +0800
Subject: [PATCH] feat: add weighted_score and sample_weight columns to score
 rows

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 rag_eval/execution/evaluator.py | 11 ++++++++++-
 tests/test_offline_eval.py      | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/rag_eval/execution/evaluator.py b/rag_eval/execution/evaluator.py
index a69454f..b2a84cc 100644
--- a/rag_eval/execution/evaluator.py
+++ b/rag_eval/execution/evaluator.py
@@ -12,6 +12,7 @@ from rag_eval.datasets.loader import load_dataset_records
 from rag_eval.datasets.normalizers import normalize_records
 from rag_eval.execution.concurrency import gather_with_limit
 from rag_eval.metrics.pipeline import MetricPipeline
+from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
 from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
 from rag_eval.shared.utils import utc_now_iso
 
@@ -171,7 +172,7 @@ class Evaluator:
         return valid, invalid
 
     def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
-        """Combine sample data, metric results, and run metadata into one output row."""
+        """Combine sample data, metric results, run metadata, and weight columns."""
         record = sample.to_record()
         record["contexts"] = sample.contexts
         record.update(score.metrics)
@@ -179,4 +180,12 @@ class Evaluator:
         record["judge_model"] = self.scenario.judge_model
         record["embedding_model"] = self.scenario.embedding_model
         record["run_id"] = self.scenario.scenario_name
+        # Weighted score columns — enable post-hoc weighted aggregation in reporting.
+        record["weighted_score"] = compute_weighted_score(
+            score.metrics, self.scenario.metric_weights
+        )
+        doc_name = str(sample.metadata.get("doc_name", "") or "")
+        record["sample_weight"] = resolve_weight(
+            self.scenario.doc_weights, doc_name, default=1.0
+        )
         return record
diff --git a/tests/test_offline_eval.py b/tests/test_offline_eval.py
index 384b665..3acc50c 100644
--- a/tests/test_offline_eval.py
+++ b/tests/test_offline_eval.py
@@ -183,6 +183,39 @@ class ScenarioAndDatasetTests(unittest.TestCase):
 
 
 class EvaluatorAndReportingTests(unittest.TestCase):
+    def test_merge_score_includes_weighted_score_and_sample_weight(self) -> None:
+        """Check that _merge_score adds weighted_score and sample_weight to the row."""
+        from unittest.mock import MagicMock
+        from rag_eval.execution.evaluator import Evaluator
+        from rag_eval.shared.models import (
+            DatasetConfig, MetricScore, NormalizedSample, Scenario,
+        )
+
+        scenario = Scenario(
+            scenario_name="w-test", mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m", embedding_model="e",
+            metrics=["faithfulness", "context_recall"],
+            output_dir=Path("out"),
+            metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
+            doc_weights={"doc.pdf": 2.0},
+        )
+        evaluator = Evaluator(
+            scenario=scenario,
+            metric_pipeline=MagicMock(),
+            app_adapter=None,
+        )
+        sample = NormalizedSample(
+            sample_id="s1", question="q", contexts=["ctx"],
+            answer="a", ground_truth="gt",
+            metadata={"doc_name": "doc.pdf"},
+        )
+        score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
+        row = evaluator._merge_score(sample, score)
+        # Expected weighted mean: (3*1.0 + 1*0.0) / (3+1) = 0.75
+        self.assertAlmostEqual(row["weighted_score"], 0.75, delta=1e-4)
+        self.assertEqual(row["sample_weight"], 2.0)
+
     def test_metric_pipeline_scores_sample(self) -> None:
         pipeline = MetricPipeline(
             metrics={