feat: use weighted metric means and add weighted_score row to summary.md

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:59:56 +08:00
parent d371ef7d24
commit 480f6d66ea
2 changed files with 118 additions and 19 deletions
--- a/rag_eval/reporting/summary.py
+++ b/rag_eval/reporting/summary.py
@@ -6,6 +6,10 @@ import math

 import pandas as pd

+from rag_eval.metrics.weights import (
+    compute_overall_weighted_score_mean,
+    weighted_metric_means,
+)
 from rag_eval.shared.models import EvaluationResult


@@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str:
        lines.append("No valid samples were scored.")
        return "\n".join(lines) + "\n"

-    for metric in result.scenario.metrics:
-        mean_value = scores[metric].mean(numeric_only=True)
-        if isinstance(mean_value, float) and not math.isnan(mean_value):
-            lines.append(f"- {metric}: `{mean_value:.4f}`")
-        else:
-            lines.append(f"- {metric}: `n/a`")
-
-    # Keep the summary self-sufficient by including every scored sample and its errors.
-    detail_columns = ["sample_id", *result.scenario.metrics, "error"]
-    detail = scores[detail_columns]
-    lines.extend(
-        [
-            "",
-            "## Per-sample Scores",
-            "",
-            "```text",
-            _table_from_frame(detail),
-            "```",
-        ]
+    score_rows_list = scores.to_dict(orient="records")
+    w_means = weighted_metric_means(
+        score_rows_list, result.scenario.metrics, result.scenario.doc_weights
    )
+
+    has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
+
+    for metric in result.scenario.metrics:
+        mean_value = w_means.get(metric)
+        w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
+        weight_note = f"  (w={w:.2f})" if result.scenario.metric_weights else ""
+        if mean_value is not None and not math.isnan(mean_value):
+            lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
+        else:
+            lines.append(f"- {metric}: `n/a`{weight_note}")
+
+    if has_weights:
+        overall_ws = compute_overall_weighted_score_mean(
+            score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
+        )
+        weight_suffix = " (加权)"
+        if overall_ws is not None and not math.isnan(overall_ws):
+            lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
+        else:
+            lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
+
+    detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
+    existing_columns = [c for c in detail_columns if c in scores.columns]
+    detail = scores[existing_columns]
+    lines.extend([
+        "",
+        "## Per-sample Scores",
+        "",
+        "```text",
+        _table_from_frame(detail),
+        "```",
+    ])
    return "\n".join(lines) + "\n"