feat: use weighted metric means and add weighted_score row to summary.md
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,10 @@ import math
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from rag_eval.metrics.weights import (
|
||||
compute_overall_weighted_score_mean,
|
||||
weighted_metric_means,
|
||||
)
|
||||
from rag_eval.shared.models import EvaluationResult
|
||||
|
||||
|
||||
@@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str:
|
||||
lines.append("No valid samples were scored.")
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
for metric in result.scenario.metrics:
|
||||
mean_value = scores[metric].mean(numeric_only=True)
|
||||
if isinstance(mean_value, float) and not math.isnan(mean_value):
|
||||
lines.append(f"- {metric}: `{mean_value:.4f}`")
|
||||
else:
|
||||
lines.append(f"- {metric}: `n/a`")
|
||||
|
||||
# Keep the summary self-sufficient by including every scored sample and its errors.
|
||||
detail_columns = ["sample_id", *result.scenario.metrics, "error"]
|
||||
detail = scores[detail_columns]
|
||||
lines.extend(
|
||||
[
|
||||
"",
|
||||
"## Per-sample Scores",
|
||||
"",
|
||||
"```text",
|
||||
_table_from_frame(detail),
|
||||
"```",
|
||||
]
|
||||
score_rows_list = scores.to_dict(orient="records")
|
||||
w_means = weighted_metric_means(
|
||||
score_rows_list, result.scenario.metrics, result.scenario.doc_weights
|
||||
)
|
||||
|
||||
has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
|
||||
|
||||
for metric in result.scenario.metrics:
|
||||
mean_value = w_means.get(metric)
|
||||
w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
|
||||
weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else ""
|
||||
if mean_value is not None and not math.isnan(mean_value):
|
||||
lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
|
||||
else:
|
||||
lines.append(f"- {metric}: `n/a`{weight_note}")
|
||||
|
||||
if has_weights:
|
||||
overall_ws = compute_overall_weighted_score_mean(
|
||||
score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
|
||||
)
|
||||
weight_suffix = " (加权)"
|
||||
if overall_ws is not None and not math.isnan(overall_ws):
|
||||
lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
|
||||
else:
|
||||
lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
|
||||
|
||||
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
|
||||
existing_columns = [c for c in detail_columns if c in scores.columns]
|
||||
detail = scores[existing_columns]
|
||||
lines.extend([
|
||||
"",
|
||||
"## Per-sample Scores",
|
||||
"",
|
||||
"```text",
|
||||
_table_from_frame(detail),
|
||||
"```",
|
||||
])
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
Reference in New Issue
Block a user