# siemens_ragas/tests/test_webapp_report_builder.py

"""Regression tests for weighted webapp report aggregation."""

from __future__ import annotations

from pathlib import Path

import pytest

from webapp.services.report_builder import build_report
from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot


def _write_run_artifacts(run_dir: Path) -> None:
    """Populate ``run_dir`` with a small set of run artifact files.

    The directory (and any missing parents) is created first. Four UTF-8
    text files are then written, in this order:

    * ``scores.csv`` - a header plus two sample rows carrying per-metric
      scores and the ``weighted_score`` / ``sample_weight`` columns.
    * ``summary.md`` - the literal text ``summary``.
    * ``optimization_advice.md`` - the literal text ``advice``.
    * ``scenario.snapshot.yaml`` - a metric list, ``metric_weights`` and
      ``doc_weights`` mappings.

    Lines are joined with ``\\n`` and no trailing newline is written.
    """
    scores_rows = (
        "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
        "s1,a.pdf,1.0,0.5,0.8333,3.0",
        "s2,b.pdf,0.0,0.5,0.1667,1.0",
    )
    snapshot_rows = (
        "metrics:",
        "  - faithfulness",
        "  - context_recall",
        "metric_weights:",
        "  faithfulness: 2.0",
        "  context_recall: 1.0",
        "doc_weights:",
        "  a.pdf: 3.0",
        "  b.pdf: 1.0",
    )
    # Dict insertion order fixes the order in which the files are written.
    artifacts = {
        "scores.csv": "\n".join(scores_rows),
        "summary.md": "summary",
        "optimization_advice.md": "advice",
        "scenario.snapshot.yaml": "\n".join(snapshot_rows),
    }

    run_dir.mkdir(parents=True, exist_ok=True)
    for file_name, content in artifacts.items():
        (run_dir / file_name).write_text(content, encoding="utf-8")


def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """The snapshot reader yields the metric-weight and doc-weight mappings.

    The run directory is populated by ``_write_run_artifacts``; the values
    asserted here mirror the ``metric_weights`` and ``doc_weights`` sections
    of the snapshot file that helper writes.
    """
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)
    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}

    # The reader returns a pair: metric weights first, document weights second.
    loaded_metric_weights, loaded_doc_weights = _read_weights_from_snapshot(artifacts_dir)

    assert loaded_metric_weights == expected_metric_weights
    assert loaded_doc_weights == expected_doc_weights


def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """``build_report`` reports weighted metric means and the snapshot weights.

    With the fixture rows (faithfulness 1.0 at weight 3.0 and 0.0 at weight
    1.0) the expected faithfulness mean is 0.75 rather than the unweighted
    0.5. The markdown artifacts are expected to be passed through verbatim.
    """
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)
    requested_metrics = ["faithfulness", "context_recall"]

    result = build_report(artifacts_dir, requested_metrics)

    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }
    assert result.metric_means == expected_means
    # The overall weighted score is temporarily disabled.
    assert result.weighted_score_mean is None
    assert result.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
    assert result.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
    assert result.summary_markdown == "summary"
    assert result.advice_markdown == "advice"


def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """Only real metric columns are inferred from ``scores.csv``.

    The run directory contains just a ``scores.csv`` (no snapshot file), whose
    header includes ``weighted_score`` and ``sample_weight``; neither may show
    up in the inferred metric list.
    """
    scores_dir = tmp_path / "run"
    scores_dir.mkdir(parents=True, exist_ok=True)

    header = "sample_id,doc_name,faithfulness,weighted_score,sample_weight"
    only_row = "s1,a.pdf,0.8,0.8,2.0"
    scores_file = scores_dir / "scores.csv"
    # Header and single data row, newline-separated, no trailing newline.
    scores_file.write_text(f"{header}\n{only_row}", encoding="utf-8")

    inferred = _infer_metrics_from_scores(scores_dir)

    assert inferred == ["faithfulness"]
feat: report_builder uses weighted means; ReportData gains weighted_score_mean Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> 2026-06-18 17:16:09 +08:00			`"""Regression tests for weighted webapp report aggregation."""`

			`from __future__ import annotations`

			`from pathlib import Path`

			`import pytest`

			`from webapp.services.report_builder import build_report`
			`from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot`


			`def _write_run_artifacts(run_dir: Path) -> None:`
			`"""Create a minimal run directory with weighted scores and a snapshot."""`
			`run_dir.mkdir(parents=True, exist_ok=True)`
			`(run_dir / "scores.csv").write_text(`
			`"\n".join(`
			`[`
			`"sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",`
			`"s1,a.pdf,1.0,0.5,0.8333,3.0",`
			`"s2,b.pdf,0.0,0.5,0.1667,1.0",`
			`]`
			`),`
			`encoding="utf-8",`
			`)`
			`(run_dir / "summary.md").write_text("summary", encoding="utf-8")`
			`(run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")`
			`(run_dir / "scenario.snapshot.yaml").write_text(`
			`"\n".join(`
			`[`
			`"metrics:",`
			`" - faithfulness",`
			`" - context_recall",`
			`"metric_weights:",`
			`" faithfulness: 2.0",`
			`" context_recall: 1.0",`
			`"doc_weights:",`
			`" a.pdf: 3.0",`
			`" b.pdf: 1.0",`
			`]`
			`),`
			`encoding="utf-8",`
			`)`


			`def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:`
			`"""Snapshot weight reader returns both weight maps as plain float dicts."""`
			`run_dir = tmp_path / "run"`
			`_write_run_artifacts(run_dir)`

			`metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)`

			`assert metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}`
			`assert doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}`


			`def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:`
			`"""Report aggregation uses weighted means and surfaces snapshot weights."""`
			`run_dir = tmp_path / "run"`
			`_write_run_artifacts(run_dir)`

			`report = build_report(run_dir, ["faithfulness", "context_recall"])`

			`assert report.metric_means == {`
			`"faithfulness": pytest.approx(0.75, rel=1e-4),`
			`"context_recall": pytest.approx(0.5, rel=1e-4),`
			`}`
feat(session-async): add /api/score/session_async with incremental session report aggregation - New POST /api/score/session_async endpoint: same session_id calls append to one shared report - New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records - New GET /api/score/session/jobs/{job_id}: individual call status - SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call - SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added - 24 new tests, all passing chore(weighted-score): comment out 综合加权得分 display and computation - report.js: hide 综合加权得分 card in report detail page - score_jobs.js: hide 综合 chip in async job list - report_builder.py: overall_ws=None (computation disabled) - summary.py: weighted_score summary line disabled - evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv - score.py /api/score: weighted_score always returns null - score_job_manager.py + session_score_manager.py: weighted=None - Updated 3 tests to match new behaviour (6 pre-existing failures unchanged) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> 2026-06-26 16:09:33 +08:00			`# 综合加权得分已暂时禁用`
			`assert report.weighted_score_mean is None`
feat: report_builder uses weighted means; ReportData gains weighted_score_mean Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> 2026-06-18 17:16:09 +08:00			`assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}`
			`assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}`
			`assert report.summary_markdown == "summary"`
			`assert report.advice_markdown == "advice"`


			`def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:`
			`"""Metric inference excludes weighted helper columns from scores.csv."""`
			`run_dir = tmp_path / "run"`
			`run_dir.mkdir(parents=True, exist_ok=True)`
			`(run_dir / "scores.csv").write_text(`
			`"\n".join(`
			`[`
			`"sample_id,doc_name,faithfulness,weighted_score,sample_weight",`
			`"s1,a.pdf,0.8,0.8,2.0",`
			`]`
			`),`
			`encoding="utf-8",`
			`)`

			`assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]`