Files
siemens_ragas/webapp/services/report_builder.py
wangwei 754a30ad59 feat(session-async): add /api/score/session_async with incremental session report aggregation
- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00

204 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Aggregate a run's per-sample scores into the report payload for the UI.
All aggregation reads only the standard scores.csv produced by the reporting
layer, plus the metric list resolved by run_reader. The output mirrors the
report detail page: metric means, per-metric distribution histograms, grouped
means by difficulty / question_type, and the lowest-scoring samples for review.
"""
from __future__ import annotations
import math
from pathlib import Path
import pandas as pd
from rag_eval.metrics.weights import (
compute_overall_weighted_score_mean,
weighted_metric_means as _weighted_metric_means,
)
from webapp.services.run_reader import _read_weights_from_snapshot
from webapp.services.text_utils import parse_contexts
from webapp.models import (
DistributionBin,
GroupStat,
ReportData,
SampleScore,
)
from webapp.services import run_reader
# Number of equal-width buckets that _distribution splits the [0, 1] score
# range into when building a metric histogram.
DISTRIBUTION_BIN_COUNT = 5
# Metadata columns that _groupings aggregates by. A field is skipped when the
# column is missing from the scores frame or holds no non-empty value.
GROUPING_FIELDS = ("difficulty", "question_type", "language")
# Maximum number of lowest-scoring samples that _lowest_samples returns for
# manual review.
LOWEST_SAMPLE_COUNT = 10
def _round_or_none(value: float | None) -> float | None:
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
if value is None:
return None
if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
return None
return round(float(value), 4)
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width histogram bins over [0, 1].

    Args:
        frame: Per-sample scores; ``metric`` is looked up as a column.
        metric: Name of the metric column to bucket.

    Returns:
        ``DISTRIBUTION_BIN_COUNT`` bins in ascending order, or an empty list
        when the column is absent. Values that cannot be parsed as numbers
        are ignored, and values outside [0, 1] fall into no bin.
    """
    bins: list[DistributionBin] = []
    if metric not in frame.columns:
        return bins
    series = pd.to_numeric(frame[metric], errors="coerce").dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1
    for index in range(DISTRIBUTION_BIN_COUNT):
        # Derive edges by division rather than ``index * width``: with five
        # bins, 3 * 0.2 evaluates to 0.6000000000000001, which would put a
        # score of exactly 0.6 into the bin below the one it belongs to.
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        # Include the right edge in the final bin so 1.0 is counted.
        if index == last_index:
            mask = (series >= lower) & (series <= upper)
        else:
            mask = (series >= lower) & (series < upper)
        bins.append(
            DistributionBin(
                # Separate the bounds so the label reads "0.0-0.2" instead of
                # the run-together "0.00.2".
                label=f"{lower:.1f}-{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int(mask.sum()),
            )
        )
    return bins
def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    Args:
        frame: Per-sample scores, optionally carrying the metadata columns
            named in ``GROUPING_FIELDS``.
        metrics: Metric column names to average; names missing from
            ``frame`` are left out of each group's means.

    Returns:
        A mapping from grouping field to its groups sorted by key. Fields
        that are missing from ``frame`` or hold no non-empty value are
        omitted so the UI does not render noise.
    """
    groupings: dict[str, list[GroupStat]] = {}
    available_metrics = [metric for metric in metrics if metric in frame.columns]
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue
        # Group on the stripped text so values that differ only by
        # surrounding whitespace land in one group instead of producing
        # several entries that share the same displayed key.
        labels = frame[field].astype(str).str.strip()
        # Missing cells stringify to "nan"; treat them like empty cells.
        usable = ~labels.isin(["", "nan"])
        if not usable.any():
            continue
        stats: list[GroupStat] = []
        for key, group in frame.loc[usable].groupby(labels.loc[usable]):
            means = {
                # Coerce to numeric, as _distribution does, so a stray
                # non-numeric cell is ignored rather than raising.
                metric: _round_or_none(
                    pd.to_numeric(group[metric], errors="coerce").mean()
                )
                for metric in available_metrics
            }
            stats.append(GroupStat(key=str(key), count=int(len(group)), means=means))
        if stats:
            stats.sort(key=lambda item: item.key)
            groupings[field] = stats
    return groupings
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
"""Average a single sample's available metric scores for ranking."""
values = [
float(row[metric])
for metric in metrics
if metric in row and pd.notna(row[metric])
]
if not values:
return None
return sum(values) / len(values)
def _cell_text(row: pd.Series, column: str) -> str:
"""Safely read a string cell, returning '' for missing or NaN values."""
if column not in row or pd.isna(row[column]):
return ""
return str(row[column]).strip()
def _build_sample_score(
    row: pd.Series, metrics: list[str], mean_score: float | None
) -> SampleScore:
    """Shape one scores row into the SampleScore shown in the review table."""
    return SampleScore(
        sample_id=_cell_text(row, "sample_id"),
        question=_cell_text(row, "question"),
        contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
        answer=_cell_text(row, "answer"),
        ground_truth=_cell_text(row, "ground_truth"),
        language=_cell_text(row, "language"),
        difficulty=_cell_text(row, "difficulty"),
        question_type=_cell_text(row, "question_type"),
        metrics={
            metric: _round_or_none(float(row[metric]))
            for metric in metrics
            if metric in row and pd.notna(row[metric])
        },
        mean_score=_round_or_none(mean_score),
        error=_cell_text(row, "error"),
    )


def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
    """Select and shape the lowest-scoring samples for the review table.

    Args:
        frame: Per-sample scores, one row per sample.
        metrics: Metric column names used for ranking and reported per sample.

    Returns:
        At most ``LOWEST_SAMPLE_COUNT`` samples ordered by ascending mean
        score. Ties keep their original row order.
    """
    if frame.empty:
        return []
    ranked: list[tuple[float, float | None, pd.Series]] = []
    for _, row in frame.iterrows():
        mean_score = _sample_mean(row, metrics)
        # Samples without any score get a key below every real mean, so they
        # sort FIRST and are always surfaced for review (treated as worst).
        sort_key = mean_score if mean_score is not None else -1.0
        ranked.append((sort_key, mean_score, row))
    # list.sort is stable, so equal keys stay in frame order.
    ranked.sort(key=lambda item: item[0])
    # Build the response models only for the rows that are actually returned
    # instead of shaping every row and discarding most of them.
    return [
        _build_sample_score(row, metrics, mean_score)
        for _, mean_score, row in ranked[:LOWEST_SAMPLE_COUNT]
    ]
def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Assemble the aggregated report payload for a single run directory.

    Reads the scores frame, the summary and advice markdown, and the weight
    snapshot for ``run_dir``. When there are no scores or no metrics, a
    minimal payload with every metric mean set to ``None`` is returned.
    Otherwise the payload also carries metric means, histograms, grouped
    means and the lowest-scoring samples.
    """
    scores = run_reader.read_scores_frame(run_dir)
    summary_md = run_reader.read_summary_markdown(run_dir)
    advice_md = run_reader.read_advice_markdown(run_dir)
    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)

    # Nothing to aggregate: return the texts and weights with empty means.
    if scores.empty or not metrics:
        return ReportData(
            metrics=metrics,
            metric_means=dict.fromkeys(metrics),
            summary_markdown=summary_md,
            advice_markdown=advice_md,
            metric_weights=metric_weights,
            doc_weights=doc_weights,
        )

    records = scores.to_dict(orient="records")
    # Use weighted metric means (degrades to arithmetic mean when weights are empty).
    weighted_means = _weighted_metric_means(records, metrics, doc_weights)
    metric_means = {}
    for name, value in weighted_means.items():
        metric_means[name] = _round_or_none(value)

    # Overall weighted score computation (temporarily disabled):
    # overall_ws = compute_overall_weighted_score_mean(
    #     records, metric_weights, doc_weights
    # )
    overall_ws = None

    # One histogram per metric that actually has a column in the scores.
    histograms = {}
    for name in metrics:
        if name in scores.columns:
            histograms[name] = _distribution(scores, name)

    return ReportData(
        metrics=metrics,
        metric_means=metric_means,
        distributions=histograms,
        groupings=_groupings(scores, metrics),
        lowest_samples=_lowest_samples(scores, metrics),
        summary_markdown=summary_md,
        advice_markdown=advice_md,
        weighted_score_mean=_round_or_none(overall_ws),
        metric_weights=metric_weights,
        doc_weights=doc_weights,
    )