Files
siemens_ragas/rag_eval/reporting/summary.py
wangwei 754a30ad59 feat(session-async): add /api/score/session_async with incremental session report aggregation
- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00

101 lines
3.6 KiB
Python

"""Markdown summary generation for completed evaluation runs."""
from __future__ import annotations

import math
import unicodedata

import pandas as pd

from rag_eval.metrics.weights import (
    compute_overall_weighted_score_mean,
    weighted_metric_means,
)
from rag_eval.shared.models import EvaluationResult
def _table_from_frame(frame: pd.DataFrame) -> str:
    """Render a small dataframe as a fixed-width markdown-friendly text table.

    Every column label and cell is converted with ``str`` and flattened to a
    single line, then padded so that columns line up in a monospaced font.
    The header row is followed by a dashed separator row and one row per
    dataframe record.

    Args:
        frame: The dataframe to render.

    Returns:
        The table as newline-joined text, or ``"No rows."`` when ``frame.empty``
        is true.
    """
    if frame.empty:
        return "No rows."

    headers = [_cell_text(column) for column in frame.columns]
    # astype(object) keeps each value as-is so str() is applied per cell.
    rows = [
        [_cell_text(value) for value in record]
        for record in frame.astype(object).values.tolist()
    ]
    widths = [
        max([_display_width(header), *(_display_width(row[idx]) for row in rows)])
        for idx, header in enumerate(headers)
    ]

    def render(cells: list[str]) -> str:
        return " | ".join(
            _pad_cell(cell, width) for cell, width in zip(cells, widths)
        )

    separator = "-|-".join("-" * width for width in widths)
    return "\n".join([render(headers), separator, *(render(row) for row in rows)])


def _cell_text(value: object) -> str:
    """Convert *value* to text and collapse line breaks into single spaces."""
    # A multi-line value (for example an error message) would otherwise spill
    # over several output lines and break the one-record-per-line layout.
    return " ".join(str(value).splitlines())


def _display_width(text: str) -> int:
    """Return the column count of *text*, counting East Asian wide chars as two."""
    return sum(
        2 if unicodedata.east_asian_width(char) in ("W", "F") else 1
        for char in text
    )


def _pad_cell(text: str, width: int) -> str:
    """Right-pad *text* with spaces until it occupies *width* display columns."""
    # str.ljust counts code points, which misaligns rows containing wide glyphs.
    return text + " " * (width - _display_width(text))
def build_summary_markdown(result: EvaluationResult) -> str:
    """Build the human-readable markdown summary written for each evaluation run.

    The document consists of a title and run metadata, a "Metric Means"
    section with one line per configured metric, and a "Per-sample Scores"
    section holding a fixed-width table inside a ``text`` code fence. When
    ``result.score_rows`` yields an empty dataframe, the document ends right
    after the "Metric Means" heading with a short notice.

    Args:
        result: The completed evaluation run. This function reads ``run_id``,
            ``valid_samples``, ``invalid_samples``, ``score_rows`` and, from
            ``scenario``: ``scenario_name``, ``mode``, ``judge_model``,
            ``embedding_model``, ``metrics``, ``metric_weights`` and
            ``doc_weights``.

    Returns:
        The markdown text, terminated by a single trailing newline.
    """
    scenario = result.scenario
    valid_count = len(result.valid_samples)
    invalid_count = len(result.invalid_samples)
    scores = pd.DataFrame(result.score_rows)

    lines = [
        f"# {scenario.scenario_name}",
        "",
        f"- run_id: `{result.run_id}`",
        f"- mode: `{scenario.mode}`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{scenario.judge_model}`",
        f"- embedding_model: `{scenario.embedding_model}`",
        "",
        "## Metric Means",
        "",
    ]
    if scores.empty:
        lines.append("No valid samples were scored.")
        return "\n".join(lines) + "\n"

    score_rows_list = scores.to_dict(orient="records")
    w_means = weighted_metric_means(
        score_rows_list, scenario.metrics, scenario.doc_weights
    )

    # Loop-invariant: the weight annotation is shown only when metric weights
    # are configured; metrics missing from the mapping are reported as 1.0.
    metric_weights = scenario.metric_weights
    for metric in scenario.metrics:
        mean_value = w_means.get(metric)
        if metric_weights:
            weight_note = f" (w={metric_weights.get(metric, 1.0):.2f})"
        else:
            weight_note = ""
        if mean_value is not None and not math.isnan(mean_value):
            lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
        else:
            lines.append(f"- {metric}: `n/a`{weight_note}")

    # NOTE: the overall ``weighted_score`` summary line is temporarily
    # disabled. It was computed with compute_overall_weighted_score_mean() from
    # the score rows, metric weights and doc weights whenever either weight
    # mapping was configured; restore it from version control when re-enabling.

    # Only columns actually present in the score rows are shown.
    detail_columns = ["sample_id", *scenario.metrics, "weighted_score", "error"]
    existing_columns = [c for c in detail_columns if c in scores.columns]
    detail = scores[existing_columns]
    lines.extend([
        "",
        "## Per-sample Scores",
        "",
        "```text",
        _table_from_frame(detail),
        "```",
    ])
    return "\n".join(lines) + "\n"