- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing
chore(weighted-score): comment out 综合加权得分 display and computation
- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
91 lines
3.1 KiB
Python
91 lines
3.1 KiB
Python
"""Regression tests for weighted webapp report aggregation."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from webapp.services.report_builder import build_report
|
|
from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
|
|
|
|
|
|
def _write_run_artifacts(run_dir: Path) -> None:
    """Populate *run_dir* with the minimal artifact files used by these tests.

    The directory (and any missing parents) is created if needed. Four UTF-8
    text files are written: ``scores.csv`` (two samples that include the
    ``weighted_score`` / ``sample_weight`` columns), ``summary.md``,
    ``optimization_advice.md`` and ``scenario.snapshot.yaml`` (metrics plus
    metric and document weights).
    """
    score_rows = (
        "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
        "s1,a.pdf,1.0,0.5,0.8333,3.0",
        "s2,b.pdf,0.0,0.5,0.1667,1.0",
    )
    snapshot_rows = (
        "metrics:",
        "  - faithfulness",
        "  - context_recall",
        "metric_weights:",
        "  faithfulness: 2.0",
        "  context_recall: 1.0",
        "doc_weights:",
        "  a.pdf: 3.0",
        "  b.pdf: 1.0",
    )
    # File name -> content. Rows are joined with "\n" and no trailing newline
    # is appended, so the files end on their last row.
    artifacts = {
        "scores.csv": "\n".join(score_rows),
        "summary.md": "summary",
        "optimization_advice.md": "advice",
        "scenario.snapshot.yaml": "\n".join(snapshot_rows),
    }

    run_dir.mkdir(parents=True, exist_ok=True)
    for file_name, content in artifacts.items():
        (run_dir / file_name).write_text(content, encoding="utf-8")
|
|
|
|
|
|
def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """The snapshot reader yields the metric and document weight maps.

    The run directory is populated by ``_write_run_artifacts``; the expected
    values mirror the ``metric_weights`` and ``doc_weights`` sections that
    helper writes into ``scenario.snapshot.yaml``.
    """
    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}

    snapshot_dir = tmp_path / "run"
    _write_run_artifacts(snapshot_dir)

    # The reader returns a pair: (metric weights, document weights).
    metric_weights, doc_weights = _read_weights_from_snapshot(snapshot_dir)

    assert metric_weights == expected_metric_weights
    assert doc_weights == expected_doc_weights
|
|
|
|
|
|
def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """The built report carries weighted metric means and the snapshot weights.

    With the fixture rows (faithfulness 1.0 at sample weight 3.0 and 0.0 at
    sample weight 1.0) the expected faithfulness mean is 0.75. The report must
    also expose the metric and document weights from the snapshot, along with
    the summary and advice markdown.
    """
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)

    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }
    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}

    report = build_report(artifacts_dir, ["faithfulness", "context_recall"])

    assert report.metric_means == expected_means
    # The overall weighted score is temporarily disabled.
    assert report.weighted_score_mean is None
    assert report.metric_weights == expected_metric_weights
    assert report.doc_weights == expected_doc_weights
    assert report.summary_markdown == "summary"
    assert report.advice_markdown == "advice"
|
|
|
|
|
|
def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """Metric inference skips the weighted helper columns in ``scores.csv``.

    No snapshot file is written here, so the metric list has to come from the
    CSV header; only ``faithfulness`` is expected back.
    """
    csv_text = "\n".join(
        (
            "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
            "s1,a.pdf,0.8,0.8,2.0",
        )
    )

    scores_dir = tmp_path / "run"
    scores_dir.mkdir(parents=True, exist_ok=True)
    (scores_dir / "scores.csv").write_text(csv_text, encoding="utf-8")

    inferred = _infer_metrics_from_scores(scores_dir)

    assert inferred == ["faithfulness"]
|