feat: use weighted metric means and add weighted_score row to summary.md
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,10 @@ import math
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
weighted_metric_means,
|
||||||
|
)
|
||||||
from rag_eval.shared.models import EvaluationResult
|
from rag_eval.shared.models import EvaluationResult
|
||||||
|
|
||||||
|
|
||||||
@@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str:
|
|||||||
lines.append("No valid samples were scored.")
|
lines.append("No valid samples were scored.")
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|
||||||
for metric in result.scenario.metrics:
|
score_rows_list = scores.to_dict(orient="records")
|
||||||
mean_value = scores[metric].mean(numeric_only=True)
|
w_means = weighted_metric_means(
|
||||||
if isinstance(mean_value, float) and not math.isnan(mean_value):
|
score_rows_list, result.scenario.metrics, result.scenario.doc_weights
|
||||||
lines.append(f"- {metric}: `{mean_value:.4f}`")
|
|
||||||
else:
|
|
||||||
lines.append(f"- {metric}: `n/a`")
|
|
||||||
|
|
||||||
# Keep the summary self-sufficient by including every scored sample and its errors.
|
|
||||||
detail_columns = ["sample_id", *result.scenario.metrics, "error"]
|
|
||||||
detail = scores[detail_columns]
|
|
||||||
lines.extend(
|
|
||||||
[
|
|
||||||
"",
|
|
||||||
"## Per-sample Scores",
|
|
||||||
"",
|
|
||||||
"```text",
|
|
||||||
_table_from_frame(detail),
|
|
||||||
"```",
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
|
||||||
|
|
||||||
|
for metric in result.scenario.metrics:
|
||||||
|
mean_value = w_means.get(metric)
|
||||||
|
w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
|
||||||
|
weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else ""
|
||||||
|
if mean_value is not None and not math.isnan(mean_value):
|
||||||
|
lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
|
||||||
|
else:
|
||||||
|
lines.append(f"- {metric}: `n/a`{weight_note}")
|
||||||
|
|
||||||
|
if has_weights:
|
||||||
|
overall_ws = compute_overall_weighted_score_mean(
|
||||||
|
score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
|
||||||
|
)
|
||||||
|
weight_suffix = " (加权)"
|
||||||
|
if overall_ws is not None and not math.isnan(overall_ws):
|
||||||
|
lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
|
||||||
|
else:
|
||||||
|
lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
|
||||||
|
|
||||||
|
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
|
||||||
|
existing_columns = [c for c in detail_columns if c in scores.columns]
|
||||||
|
detail = scores[existing_columns]
|
||||||
|
lines.extend([
|
||||||
|
"",
|
||||||
|
"## Per-sample Scores",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
_table_from_frame(detail),
|
||||||
|
"```",
|
||||||
|
])
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|||||||
@@ -216,6 +216,84 @@ class EvaluatorAndReportingTests(unittest.TestCase):
|
|||||||
assert abs(row["weighted_score"] - 0.75) < 1e-4
|
assert abs(row["weighted_score"] - 0.75) < 1e-4
|
||||||
assert row["sample_weight"] == 2.0
|
assert row["sample_weight"] == 2.0
|
||||||
|
|
||||||
|
def test_summary_markdown_shows_weighted_score(self) -> None:
    """build_summary_markdown emits the overall weighted_score bullet when metric_weights is set."""
    from pathlib import Path

    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        DatasetConfig,
        EvaluationResult,
        NormalizedSample,
        Scenario,
    )

    # A non-empty metric_weights mapping is what switches the weighted summary on.
    scenario = Scenario(
        scenario_name="ws-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 1.0},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        score_rows=[
            {
                "sample_id": "s1",
                "faithfulness": 0.8,
                "weighted_score": 0.8,
                "sample_weight": 1.0,
                "doc_name": "",
                "error": "",
            }
        ],
    )

    md = build_summary_markdown(result)

    # Match the summary bullet prefix rather than the bare word: the per-sample
    # table can also contain a "weighted_score" column header, which would make
    # a plain substring check pass even if the bullet were missing.
    assert "- **weighted_score" in md
    assert "0.8000" in md
|
||||||
|
|
||||||
|
def test_summary_markdown_hides_weighted_score_without_weights(self) -> None:
    """build_summary_markdown omits the weighted_score bullet when no weights are set."""
    # Function-scope imports keep this test self-contained, mirroring the
    # sibling weighted-score test instead of relying on module-level names.
    from pathlib import Path

    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        DatasetConfig,
        EvaluationResult,
        NormalizedSample,
        Scenario,
    )

    # Both weight mappings are empty, so the summary should stay unweighted.
    scenario = Scenario(
        scenario_name="plain-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        score_rows=[
            {
                "sample_id": "s1",
                "faithfulness": 0.8,
                "weighted_score": 0.8,
                "sample_weight": 1.0,
                "doc_name": "",
                "error": "",
            }
        ],
    )

    md = build_summary_markdown(result)

    # Check the bullet prefix, not the bare word: the score row above still
    # carries a "weighted_score" key that may show up in the per-sample table.
    assert "- **weighted_score" not in md
|
||||||
|
|
||||||
def test_metric_pipeline_scores_sample(self) -> None:
|
def test_metric_pipeline_scores_sample(self) -> None:
|
||||||
pipeline = MetricPipeline(
|
pipeline = MetricPipeline(
|
||||||
metrics={
|
metrics={
|
||||||
|
|||||||
Reference in New Issue
Block a user