This commit is contained in:
2026-06-27 14:31:45 +08:00
parent 1df4010acc
commit 9828b1d44c
16 changed files with 323 additions and 23 deletions

View File

@@ -37,6 +37,9 @@ GROUPING_FIELDS = ("difficulty", "question_type", "language")
# How many lowest-scoring samples to surface for manual review.
LOWEST_SAMPLE_COUNT: int = 10
# Metrics whose lower raw value means stronger performance.
# _sample_mean substitutes (1.0 - score) for these when averaging a sample's
# metric scores, so they are not averaged in their raw direction.
# NOTE(review): the (1.0 - score) flip presumes scores lie in [0, 1] — confirm.
LOWER_IS_BETTER_METRICS: set[str] = {"noise_sensitivity"}
def _round_or_none(value: float | None) -> float | None:
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
@@ -105,7 +108,7 @@ def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupS
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
"""Average a single sample's available metric scores for ranking."""
values = [
float(row[metric])
(1.0 - float(row[metric])) if metric in LOWER_IS_BETTER_METRICS else float(row[metric])
for metric in metrics
if metric in row and pd.notna(row[metric])
]