feat: add weighted_score and sample_weight columns to score rows
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,7 @@ from rag_eval.datasets.loader import load_dataset_records
|
|||||||
from rag_eval.datasets.normalizers import normalize_records
|
from rag_eval.datasets.normalizers import normalize_records
|
||||||
from rag_eval.execution.concurrency import gather_with_limit
|
from rag_eval.execution.concurrency import gather_with_limit
|
||||||
from rag_eval.metrics.pipeline import MetricPipeline
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
|
||||||
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
||||||
from rag_eval.shared.utils import utc_now_iso
|
from rag_eval.shared.utils import utc_now_iso
|
||||||
|
|
||||||
@@ -171,7 +172,7 @@ class Evaluator:
|
|||||||
return valid, invalid
|
return valid, invalid
|
||||||
|
|
||||||
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
||||||
"""Combine sample data, metric results, and run metadata into one output row."""
|
"""Combine sample data, metric results, run metadata, and weight columns."""
|
||||||
record = sample.to_record()
|
record = sample.to_record()
|
||||||
record["contexts"] = sample.contexts
|
record["contexts"] = sample.contexts
|
||||||
record.update(score.metrics)
|
record.update(score.metrics)
|
||||||
@@ -179,4 +180,12 @@ class Evaluator:
|
|||||||
record["judge_model"] = self.scenario.judge_model
|
record["judge_model"] = self.scenario.judge_model
|
||||||
record["embedding_model"] = self.scenario.embedding_model
|
record["embedding_model"] = self.scenario.embedding_model
|
||||||
record["run_id"] = self.scenario.scenario_name
|
record["run_id"] = self.scenario.scenario_name
|
||||||
|
# Weighted score columns — enable post-hoc weighted aggregation in reporting.
|
||||||
|
record["weighted_score"] = compute_weighted_score(
|
||||||
|
score.metrics, self.scenario.metric_weights
|
||||||
|
)
|
||||||
|
doc_name = str(sample.metadata.get("doc_name", "") or "")
|
||||||
|
record["sample_weight"] = resolve_weight(
|
||||||
|
self.scenario.doc_weights, doc_name, default=1.0
|
||||||
|
)
|
||||||
return record
|
return record
|
||||||
|
|||||||
@@ -183,6 +183,39 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class EvaluatorAndReportingTests(unittest.TestCase):
|
class EvaluatorAndReportingTests(unittest.TestCase):
|
||||||
|
def test_merge_score_includes_weighted_score_and_sample_weight(self) -> None:
    """_merge_score adds weighted_score and sample_weight columns.

    Builds a scenario with metric weights 3:1 (faithfulness:context_recall)
    and a document weight of 2.0 for ``doc.pdf``, merges a single scored
    sample whose metadata names that document, and checks both derived
    columns on the resulting row.
    """
    # Kept function-local, as in the original test, so this block does not
    # depend on the test module's top-level imports.
    from unittest.mock import MagicMock

    from rag_eval.execution.evaluator import Evaluator
    from rag_eval.shared.models import (
        DatasetConfig,
        MetricScore,
        NormalizedSample,
        Scenario,
    )

    scenario = Scenario(
        scenario_name="w-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness", "context_recall"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
        doc_weights={"doc.pdf": 2.0},
    )
    # The pipeline is never invoked by _merge_score in this test, so a mock
    # stands in for it; no app adapter is supplied.
    evaluator = Evaluator(
        scenario=scenario,
        metric_pipeline=MagicMock(),
        app_adapter=None,
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["ctx"],
        answer="a",
        ground_truth="gt",
        metadata={"doc_name": "doc.pdf"},
    )
    score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})

    row = evaluator._merge_score(sample, score)

    # (3*1.0 + 1*0.0) / (3+1) = 0.75
    # unittest assertions are used instead of bare ``assert`` so the checks
    # still run under ``python -O`` and report the offending values.
    self.assertAlmostEqual(row["weighted_score"], 0.75, delta=1e-4)
    self.assertEqual(row["sample_weight"], 2.0)
|
||||||
|
|
||||||
def test_metric_pipeline_scores_sample(self) -> None:
|
def test_metric_pipeline_scores_sample(self) -> None:
|
||||||
pipeline = MetricPipeline(
|
pipeline = MetricPipeline(
|
||||||
metrics={
|
metrics={
|
||||||
|
|||||||
Reference in New Issue
Block a user