feat: use weighted metric means and add weighted_score row to summary.md
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -216,6 +216,84 @@ class EvaluatorAndReportingTests(unittest.TestCase):
|
||||
assert abs(row["weighted_score"] - 0.75) < 1e-4
|
||||
assert row["sample_weight"] == 2.0
|
||||
|
||||
def test_summary_markdown_shows_weighted_score(self) -> None:
    """build_summary_markdown includes weighted_score when metric_weights set.

    Builds a one-sample ``EvaluationResult`` whose scenario declares a
    weight for the ``faithfulness`` metric, renders the summary markdown,
    and checks that it mentions ``weighted_score`` and the text ``0.8000``.
    """
    # Imports stay function-local, grouped stdlib first and project second.
    from pathlib import Path

    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        DatasetConfig,
        EvaluationResult,
        NormalizedSample,
        Scenario,
    )

    # A non-empty metric_weights mapping is the condition under test.
    scenario = Scenario(
        scenario_name="ws-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 1.0},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        score_rows=[
            {
                "sample_id": "s1",
                "faithfulness": 0.8,
                "weighted_score": 0.8,
                "sample_weight": 1.0,
                "doc_name": "",
                "error": "",
            }
        ],
    )

    md = build_summary_markdown(result)

    assert "weighted_score" in md
    # NOTE(review): presumably the summary renders scores with four decimal
    # places, so 0.8 appears as "0.8000" -- confirm against the formatter.
    assert "0.8000" in md
|
||||
|
||||
def test_summary_markdown_hides_weighted_score_without_weights(self) -> None:
    """build_summary_markdown preserves unweighted summaries when no weights set.

    Builds a one-sample ``EvaluationResult`` whose scenario has empty
    ``metric_weights`` and ``doc_weights``, renders the summary markdown,
    and checks that no ``- **weighted_score`` bullet is emitted even though
    the score row itself carries a ``weighted_score`` value.
    """
    # Import everything the test uses locally, mirroring the sibling
    # weighted-score test, so this test does not depend on module-level
    # imports of Path or build_summary_markdown being present.
    from pathlib import Path

    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        DatasetConfig,
        EvaluationResult,
        NormalizedSample,
        Scenario,
    )

    # Empty weight mappings are the condition under test.
    scenario = Scenario(
        scenario_name="plain-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        score_rows=[
            {
                "sample_id": "s1",
                "faithfulness": 0.8,
                "weighted_score": 0.8,
                "sample_weight": 1.0,
                "doc_name": "",
                "error": "",
            }
        ],
    )

    md = build_summary_markdown(result)

    # Only the bullet-style summary row is checked; other mentions of the
    # column name are not ruled out by this assertion.
    assert "- **weighted_score" not in md
|
||||
|
||||
def test_metric_pipeline_scores_sample(self) -> None:
|
||||
pipeline = MetricPipeline(
|
||||
metrics={
|
||||
|
||||
Reference in New Issue
Block a user