feat(session-async): add /api/score/session_async with incremental session report aggregation
- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing
chore(weighted-score): comment out 综合加权得分 (overall weighted score) display and computation
- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -184,7 +184,7 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
||||
|
||||
class EvaluatorAndReportingTests(unittest.TestCase):
|
||||
def test_merge_score_includes_weighted_score_and_sample_weight(self):
|
||||
"""_merge_score adds weighted_score and sample_weight columns."""
|
||||
"""_merge_score no longer adds weighted_score/sample_weight (feature disabled)."""
|
||||
from unittest.mock import MagicMock
|
||||
from rag_eval.execution.evaluator import Evaluator
|
||||
from rag_eval.shared.models import (
|
||||
@@ -212,9 +212,11 @@ class EvaluatorAndReportingTests(unittest.TestCase):
|
||||
)
|
||||
score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
|
||||
row = evaluator._merge_score(sample, score)
|
||||
# (3*1.0 + 1*0.0) / (3+1) = 0.75
|
||||
assert abs(row["weighted_score"] - 0.75) < 1e-4
|
||||
assert row["sample_weight"] == 2.0
|
||||
# 综合加权得分已暂时禁用,weighted_score 和 sample_weight 不再写入
|
||||
assert "weighted_score" not in row
|
||||
assert "sample_weight" not in row
|
||||
assert row["faithfulness"] == 1.0
|
||||
assert row["context_recall"] == 0.0
|
||||
|
||||
def test_summary_markdown_shows_weighted_score(self):
|
||||
"""build_summary_markdown includes weighted_score when metric_weights set."""
|
||||
|
||||
280
tests/test_pipeline.py
Normal file
280
tests/test_pipeline.py
Normal file
@@ -0,0 +1,280 @@
|
||||
"""Tests for the end-to-end pipeline API and pipeline task manager."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
# ── fixtures ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """Provide a TestClient wired to an isolated PipelineTaskManager.

    A new manager (``max_workers=2``) replaces the ``pipeline_task_manager``
    attribute of both the service module and the API module, and the service
    module's ``_PIPELINE_OUTPUT_ROOT`` is pointed at ``tmp_path / "pipeline"``.
    """
    import webapp.services.pipeline_task_manager as manager_module

    isolated_manager = manager_module.PipelineTaskManager(max_workers=2)
    monkeypatch.setattr(manager_module, "pipeline_task_manager", isolated_manager)
    monkeypatch.setattr(manager_module, "_PIPELINE_OUTPUT_ROOT", tmp_path / "pipeline")

    # The API module holds its own reference to the manager, so patch it too.
    import webapp.api.pipeline as routes_module

    monkeypatch.setattr(routes_module, "pipeline_task_manager", isolated_manager)

    from webapp.server import create_app

    app = create_app()
    return TestClient(app)
|
||||
|
||||
|
||||
def _minimal_pdf_dir(tmp_path: Path) -> Path:
    """Create (if needed) and return an empty ``pdfs`` directory under *tmp_path*.

    The directory stands in for a PDF input folder. The call is idempotent:
    invoking it twice with the same *tmp_path* returns the same directory
    instead of raising ``FileExistsError``.
    """
    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir(parents=True, exist_ok=True)
    return pdf_dir
|
||||
|
||||
|
||||
def _mock_build_result(tmp_path: Path, job, run_id="r1"):
    """Build a fake DatasetBuildResult backed by small files under *tmp_path*.

    Side effects: writes a one-record ``source_chunks.jsonl`` into both
    ``build/<run_id>`` and ``build/latest``, and a one-row
    ``generated_dataset.csv`` directly under *tmp_path*.

    Returns a result carrying *job*, *run_id*, the artifact paths rooted at
    ``build/<run_id>``, one draft sample, no documents and no parse failures.
    """
    from rag_eval.dataset_builder.models import (
        DatasetBuildArtifactPaths,
        DatasetBuildResult,
        DraftQuestionSample,
    )

    build_root = tmp_path / "build"
    run_dir = build_root / run_id
    latest_dir = build_root / "latest"
    for directory in (run_dir, latest_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Single source chunk, serialized as one JSON line.
    chunk_record = {
        "chunk_id": "c1",
        "doc_id": "d1",
        "doc_name": "test.pdf",
        "text": "CT scan context.",
        "page_start": 1,
        "page_end": 1,
        "section_path": "/",
        "section_title": "",
        "source_layout_ids": [],
    }
    chunks_file = run_dir / "source_chunks.jsonl"
    chunks_file.write_text(json.dumps(chunk_record) + "\n", encoding="utf-8")
    mirrored_text = chunks_file.read_text(encoding="utf-8")
    (latest_dir / "source_chunks.jsonl").write_text(mirrored_text, encoding="utf-8")

    # One-row dataset CSV matching the draft sample below.
    csv_header = (
        "sample_id,question,ground_truth,scenario,language,doc_id,doc_name,"
        "section_path,page_start,page_end,source_chunk_ids,question_type,difficulty,"
        "review_status,review_notes\n"
    )
    csv_row = (
        's1,"What is CT?","CT is imaging.","test","zh","d1","test.pdf","/",'
        '1,1,"[""c1""]","fact","easy","draft",""\n'
    )
    csv_file = tmp_path / "generated_dataset.csv"
    csv_file.write_text(csv_header + csv_row, encoding="utf-8")

    draft = DraftQuestionSample(
        sample_id="s1",
        question="What is CT?",
        ground_truth="CT is imaging.",
        scenario="test",
        language="zh",
        doc_id="d1",
        doc_name="test.pdf",
        section_path="/",
        page_start=1,
        page_end=1,
        source_chunk_ids=["c1"],
        question_type="fact",
        difficulty="easy",
    )

    paths = DatasetBuildArtifactPaths(
        root_dir=run_dir,
        documents_jsonl=run_dir / "documents.jsonl",
        semantic_blocks_jsonl=run_dir / "semantic_blocks.jsonl",
        source_chunks_jsonl=chunks_file,
        dataset_draft_csv=run_dir / "dataset_draft.csv",
        parse_failures_csv=run_dir / "parse_failures.csv",
        metadata_json=run_dir / "metadata.json",
    )

    return DatasetBuildResult(
        job=job,
        run_id=run_id,
        artifact_paths=paths,
        documents=[],
        draft_samples=[draft],
        parse_failures=[],
    )
|
||||
|
||||
|
||||
def _mock_eval_result(tmp_path: Path, scenario):
    """Build a fake, empty EvaluationResult for *scenario*.

    The result has run id ``eval-r1``, fixed start/finish timestamps and no
    samples or score rows. *tmp_path* is accepted but not used.
    """
    from rag_eval.shared.models import EvaluationResult

    fields = {
        "scenario": scenario,
        "run_id": "eval-r1",
        "started_at": "2026-01-01T00:00:00",
        "finished_at": "2026-01-01T00:01:00",
        "valid_samples": [],
        "invalid_samples": [],
        "score_rows": [],
    }
    return EvaluationResult(**fields)
|
||||
|
||||
|
||||
# ── API route tests ────────────────────────────────────────────────────────────
|
||||
|
||||
def test_submit_returns_202_and_job_id(client, tmp_path):
    """Submitting to POST /api/pipeline/jobs answers 202 with job_id and job_name."""
    from webapp.models import PipelineResult

    docs_dir = _minimal_pdf_dir(tmp_path)
    canned_result = PipelineResult(
        build_artifact_dir="/tmp/b",
        dataset_csv="/tmp/d.csv",
        source_chunks_jsonl="/tmp/c.jsonl",
        total_questions=1,
        parse_failures=0,
        eval_run_id="r1",
        eval_output_dir="/tmp/e",
        scores_csv="/tmp/scores.csv",
        summary_md="/tmp/summary.md",
    )
    payload = {"docs_path": str(docs_dir), "job_name": "test-job"}

    with patch(
        "webapp.services.pipeline_task_manager.PipelineTaskManager._execute",
        return_value=canned_result,
    ):
        resp = client.post("/api/pipeline/jobs", json=payload)

    assert resp.status_code == 202
    body = resp.json()
    assert "job_id" in body
    assert body["job_name"] == "test-job"
    # The mocked _execute returns instantly, so the job may already be done
    # by the time the response is read.
    assert body["status"] in ("queued", "completed")
|
||||
|
||||
|
||||
def test_get_nonexistent_job_returns_404(client):
    """Looking up an unknown pipeline job id yields HTTP 404."""
    missing_job_url = "/api/pipeline/jobs/doesnotexist"
    response = client.get(missing_job_url)
    assert response.status_code == 404
|
||||
|
||||
|
||||
def test_list_jobs_returns_empty_initially(client):
    """With nothing submitted, GET /api/pipeline/jobs answers 200 and an empty list."""
    response = client.get("/api/pipeline/jobs")
    assert response.status_code == 200
    listed = response.json()["jobs"]
    assert listed == []
|
||||
|
||||
|
||||
def test_job_status_polling(client, tmp_path):
    """Submitted job becomes visible via GET /api/pipeline/jobs/{id} and completes.

    ``PipelineTaskManager._execute`` is replaced by a mock returning a canned
    ``PipelineResult``; the test then polls the status endpoint until the job
    reaches a terminal state and checks the reported result.
    """
    from webapp.models import PipelineResult

    pdf_dir = _minimal_pdf_dir(tmp_path)
    canned_result = PipelineResult(
        build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
        source_chunks_jsonl="/tmp/c.jsonl", total_questions=3,
        parse_failures=0, eval_run_id="r2", eval_output_dir="/tmp/e",
        scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
    )

    # Keep the patch active for the whole polling window: the job presumably
    # runs on a manager worker, which must not see the real _execute if it
    # starts after the submit call returns.
    with patch(
        "webapp.services.pipeline_task_manager.PipelineTaskManager._execute",
        return_value=canned_result,
    ):
        post_resp = client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir)})
        # Fail here with a clear status code rather than a KeyError on "job_id".
        assert post_resp.status_code == 202
        job_id = post_resp.json()["job_id"]

        # Poll until done or timeout (20 x 0.25s = 5s budget for the mock).
        for _ in range(20):
            status_resp = client.get(f"/api/pipeline/jobs/{job_id}")
            assert status_resp.status_code == 200
            status = status_resp.json()
            if status["status"] in ("completed", "failed"):
                break
            time.sleep(0.25)
        else:
            pytest.fail(
                f"job {job_id} still {status['status']!r} after 5s of polling"
            )

    # Surface the job's own error text when it ended in "failed".
    assert status["status"] == "completed", status.get("error")
    assert status["result"]["total_questions"] == 3
|
||||
|
||||
|
||||
def test_job_fails_on_invalid_docs_path(client):
    """Job is accepted (202) but ends in ``failed`` when docs_path does not exist.

    The error text reported by the status endpoint must mention ``docs_path``
    or contain "not" (case-insensitive).
    """
    resp = client.post("/api/pipeline/jobs", json={
        "docs_path": "/nonexistent/path/that/does/not/exist",
    })
    assert resp.status_code == 202
    job_id = resp.json()["job_id"]

    # Poll until the job reaches a terminal state (20 x 0.25s = 5s budget).
    for _ in range(20):
        status_resp = client.get(f"/api/pipeline/jobs/{job_id}")
        # Same check as the sibling polling test: a non-200 here should fail
        # as a status-code mismatch, not as a KeyError further down.
        assert status_resp.status_code == 200
        status = status_resp.json()
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(0.25)
    else:
        pytest.fail(f"job {job_id} still {status['status']!r} after 5s of polling")

    assert status["status"] == "failed"
    # A null error should fail the assertion below, not raise TypeError.
    error_text = status["error"] or ""
    assert "docs_path" in error_text or "not" in error_text.lower()
|
||||
|
||||
|
||||
def test_list_jobs_shows_submitted(client, tmp_path):
    """A submitted job shows up, by name, in GET /api/pipeline/jobs."""
    from webapp.models import PipelineResult

    docs_dir = _minimal_pdf_dir(tmp_path)
    canned_result = PipelineResult(
        build_artifact_dir="/tmp/b",
        dataset_csv="/tmp/d.csv",
        source_chunks_jsonl="/tmp/c.jsonl",
        total_questions=1,
        parse_failures=0,
        eval_run_id="r3",
        eval_output_dir="/tmp/e",
        scores_csv="/tmp/scores.csv",
        summary_md="/tmp/summary.md",
    )
    submission = {"docs_path": str(docs_dir), "job_name": "listed-job"}

    with patch(
        "webapp.services.pipeline_task_manager.PipelineTaskManager._execute",
        return_value=canned_result,
    ):
        client.post("/api/pipeline/jobs", json=submission)

    # Brief pause before listing, as in the original test.
    time.sleep(0.5)
    listing = client.get("/api/pipeline/jobs")
    assert listing.status_code == 200

    listed_jobs = listing.json()["jobs"]
    assert len(listed_jobs) >= 1
    listed_names = [entry["job_name"] for entry in listed_jobs]
    assert "listed-job" in listed_names
|
||||
|
||||
|
||||
# ── execute_dataset_build_job refactor test ────────────────────────────────────
|
||||
|
||||
def test_execute_dataset_build_job_directly(tmp_path):
    """execute_dataset_build_job runs the build without a YAML file.

    A ``DatasetBuildJob`` is constructed in code, the parser and generator are
    replaced with mocks, and the test checks that the returned result keeps
    the job name and that its artifact root directory exists on disk.
    """
    from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
    from rag_eval.dataset_builder.runner import execute_dataset_build_job
    from rag_eval.settings import EvaluationSettings

    # One placeholder file so the "*.pdf" glob has something to match.
    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir()
    (pdf_dir / "doc.pdf").write_bytes(b"%PDF-fake")

    job = DatasetBuildJob(
        job_name="direct-test",
        input_path=pdf_dir,
        input_glob="*.pdf",
        parser_provider="aliyun_docmind",
        failure_mode="skip",
        generation_model="test-model",
        output_type="online_question_bank",
        review_mode="draft_with_manual_review",
        max_questions_per_document=5,
        max_source_chunks_per_question=3,
        dataset_path=tmp_path / "out.csv",
        artifact_dir=tmp_path / "artifacts",
        runtime=DatasetBuildRuntime(max_documents=1),
    )

    # Parsed-document stand-in with empty content and a fixed record.
    mock_doc = MagicMock()
    mock_doc.doc_id = "d1"
    mock_doc.doc_name = "doc.pdf"
    mock_doc.source_chunks = []
    mock_doc.semantic_blocks = []
    mock_doc.raw_text = ""
    mock_doc.structure_nodes = []
    mock_doc.metadata = {}
    mock_doc.to_record.return_value = {
        "doc_id": "d1", "doc_name": "doc.pdf", "raw_text": "",
        "structure_nodes": [], "metadata": {},
        "semantic_block_count": 0, "source_chunk_count": 0,
    }

    mock_parser = MagicMock()
    mock_parser.parse.return_value = mock_doc

    # The generator yields no questions.
    mock_generator = MagicMock()
    mock_generator.generate.return_value = []

    result = execute_dataset_build_job(
        job,
        settings=EvaluationSettings(_env_file=None),
        parser=mock_parser,
        generator=mock_generator,
    )
    assert result.job.job_name == "direct-test"
    assert result.artifact_paths.root_dir.exists()
|
||||
@@ -65,7 +65,8 @@ def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path:
|
||||
"faithfulness": pytest.approx(0.75, rel=1e-4),
|
||||
"context_recall": pytest.approx(0.5, rel=1e-4),
|
||||
}
|
||||
assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)
|
||||
# 综合加权得分已暂时禁用
|
||||
assert report.weighted_score_mean is None
|
||||
assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
|
||||
assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
|
||||
assert report.summary_markdown == "summary"
|
||||
|
||||
@@ -241,7 +241,8 @@ class TestScoreEndpoint:
|
||||
})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert data["weighted_score"] is not None
|
||||
# 综合加权得分已暂时禁用,始终返回 null
|
||||
assert data["weighted_score"] is None
|
||||
|
||||
def test_missing_required_fields_returns_422(self, client):
|
||||
resp = client.post("/api/score", json={"question": "q"})
|
||||
|
||||
299
tests/webapp/test_session_score_jobs_api.py
Normal file
299
tests/webapp/test_session_score_jobs_api.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""Tests for session-grouped async scoring API and SessionScoreJobManager."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture()
def tmp_manager(tmp_path):
    """Provide a SessionScoreJobManager whose output and index dirs live under tmp_path.

    Intended for unit tests that must not make real LLM calls.
    """
    from webapp.services.session_score_manager import SessionScoreJobManager

    manager_kwargs = {
        "output_dir": tmp_path / "score-session",
        "index_dir": tmp_path / "score-session-jobs",
        "max_workers": 2,
    }
    return SessionScoreJobManager(**manager_kwargs)
|
||||
|
||||
|
||||
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """TestClient with fresh SessionScoreJobManager backed by tmp dirs.

    The new manager replaces the ``session_score_manager`` attribute of both
    the service module and the API module before the app is created.
    Tests using this fixture are skipped when ``fastapi.testclient`` cannot
    be imported.
    """
    # Resolve the optional dependency first. The webapp imports below
    # presumably need fastapi themselves, so a guard placed after them would
    # raise ImportError instead of skipping.
    testclient_mod = pytest.importorskip("fastapi.testclient")

    import webapp.services.session_score_manager as mgr_mod
    from webapp.services.session_score_manager import SessionScoreJobManager

    fresh_mgr = SessionScoreJobManager(
        output_dir=tmp_path / "score-session",
        index_dir=tmp_path / "score-session-jobs",
        max_workers=2,
    )
    monkeypatch.setattr(mgr_mod, "session_score_manager", fresh_mgr)

    import webapp.api.session_score_jobs as api_mod
    monkeypatch.setattr(api_mod, "session_score_manager", fresh_mgr)

    from webapp.server import create_app
    return testclient_mod.TestClient(create_app())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unit tests for SessionScoreJobManager
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSessionRunId:
    """Properties of the run id that ``session_run_id`` derives from a session id."""

    def test_same_session_always_same_run_id(self, tmp_manager):
        first = tmp_manager.session_run_id("abc")
        second = tmp_manager.session_run_id("abc")
        assert first == second

    def test_different_sessions_different_run_ids(self, tmp_manager):
        run_a = tmp_manager.session_run_id("session-A")
        run_b = tmp_manager.session_run_id("session-B")
        assert run_a != run_b

    def test_run_id_prefixed_with_session(self, tmp_manager):
        run_id = tmp_manager.session_run_id("test123")
        assert run_id.startswith("session-")

    def test_special_chars_sanitized(self, tmp_manager):
        run_id = tmp_manager.session_run_id("user@dify:flow/001")
        # None of the separator characters from the session id may survive.
        for forbidden in ("/", "@", ":"):
            assert forbidden not in run_id
|
||||
|
||||
|
||||
class TestSubmit:
    """``submit`` bookkeeping, with the executor's ``submit`` patched out."""

    def test_submit_returns_job_status_and_run_id(self, tmp_manager):
        with patch.object(tmp_manager._executor, "submit"):
            job_status, run_id = tmp_manager.submit("session-1", _mock_request())
        assert job_status.job_id
        assert job_status.status == "queued"
        expected_run_id = tmp_manager.session_run_id("session-1")
        assert run_id == expected_run_id

    def test_submit_adds_job_to_session(self, tmp_manager):
        with patch.object(tmp_manager._executor, "submit"):
            job_status, _ = tmp_manager.submit("session-1", _mock_request())
        session = tmp_manager.get_session("session-1")
        assert session is not None
        recorded_ids = [job.job_id for job in session.jobs]
        assert job_status.job_id in recorded_ids

    def test_multiple_submits_same_session_accumulate(self, tmp_manager):
        with patch.object(tmp_manager._executor, "submit"):
            for _ in range(3):
                tmp_manager.submit("session-X", _mock_request())
        session = tmp_manager.get_session("session-X")
        assert session.call_count == 3

    def test_get_unknown_job_returns_none(self, tmp_manager):
        missing_job = tmp_manager.get_job("does-not-exist")
        assert missing_job is None

    def test_get_unknown_session_returns_none(self, tmp_manager):
        missing_session = tmp_manager.get_session("no-such-session")
        assert missing_session is None
|
||||
|
||||
|
||||
class TestSessionIndexPersistence:
    """The job index written under ``index_dir`` is created and reloadable."""

    def test_session_index_survives_restart(self, tmp_path):
        """Jobs and session mappings loaded from disk on new manager instance."""
        from webapp.services.session_score_manager import SessionScoreJobManager

        mgr1 = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        with patch.object(mgr1._executor, "submit"):
            mgr1.submit("persist-session", _mock_request())
            mgr1.submit("persist-session", _mock_request())

        # New manager instance loads from disk
        mgr2 = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        session = mgr2.get_session("persist-session")
        assert session is not None
        assert session.call_count == 2

    def test_job_index_file_created_on_submit(self, tmp_path):
        """Submitting a job writes ``<job_id>.json`` into the index directory."""
        from webapp.services.session_score_manager import SessionScoreJobManager

        mgr = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        with patch.object(mgr._executor, "submit"):
            status, _ = mgr.submit("file-test", _mock_request())
        index_file = tmp_path / "score-session-jobs" / f"{status.job_id}.json"
        assert index_file.is_file()
        # Decode explicitly so the read does not depend on the platform's
        # default locale encoding.
        data = json.loads(index_file.read_text(encoding="utf-8"))
        assert data["job_id"] == status.job_id
|
||||
|
||||
|
||||
class TestAppendBehaviour:
    """Exercise the scores.csv readers: ``_read_score_rows`` and ``_read_metric_means``."""

    def test_read_score_rows_returns_empty_for_missing_csv(self, tmp_manager, tmp_path):
        missing_run_dir = tmp_path / "nonexistent"
        assert tmp_manager._read_score_rows(missing_run_dir) == []

    def test_read_score_rows_reads_existing_csv(self, tmp_manager, tmp_path):
        run_dir = tmp_path / "run1"
        run_dir.mkdir()
        records = [{"sample_id": "s1", "answer_relevancy": 0.9}]
        pd.DataFrame(records).to_csv(run_dir / "scores.csv", index=False)

        loaded = tmp_manager._read_score_rows(run_dir)

        assert len(loaded) == 1
        assert loaded[0]["sample_id"] == "s1"

    def test_metric_means_computed_from_csv(self, tmp_manager, tmp_path):
        run_dir = tmp_path / "run2"
        run_dir.mkdir()
        records = [
            {"sample_id": "s1", "answer_relevancy": 0.8},
            {"sample_id": "s2", "answer_relevancy": 0.6},
        ]
        pd.DataFrame(records).to_csv(run_dir / "scores.csv", index=False)

        metric_means = tmp_manager._read_metric_means(run_dir)

        # (0.8 + 0.6) / 2
        assert metric_means["answer_relevancy"] == pytest.approx(0.7, abs=1e-4)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API endpoint tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSessionAsyncEndpoints:
    """HTTP-level checks for the session-grouped async scoring routes.

    Tests that submit jobs patch ``SessionScoreJobManager._run`` so no scoring
    work is executed.
    """

    _RUN_TARGET = "webapp.services.session_score_manager.SessionScoreJobManager._run"

    @staticmethod
    def _submit(client, session_id, question="Q", answer="A"):
        """POST one scoring call for *session_id* and return the response."""
        return client.post("/api/score/session_async", json={
            "session_id": session_id,
            "question": question,
            "answer": answer,
            "metrics": ["answer_relevancy"],
        })

    def test_submit_returns_202_with_session_fields(self, client):
        with patch(self._RUN_TARGET):
            resp = self._submit(
                client,
                "test-session-001",
                question="What is CT?",
                answer="CT is computed tomography.",
            )
        assert resp.status_code == 202
        body = resp.json()
        assert body["session_id"] == "test-session-001"
        assert "job_id" in body
        assert "run_id" in body
        assert body["status"] == "queued"
        assert body["call_count"] >= 1

    def test_run_id_deterministic_for_session(self, client):
        with patch(self._RUN_TARGET):
            first = self._submit(client, "det-session", question="Q1", answer="A1")
            second = self._submit(client, "det-session", question="Q2", answer="A2")
        assert first.json()["run_id"] == second.json()["run_id"]

    def test_different_sessions_different_run_ids(self, client):
        with patch(self._RUN_TARGET):
            resp_a = self._submit(client, "session-A")
            resp_b = self._submit(client, "session-B")
        assert resp_a.json()["run_id"] != resp_b.json()["run_id"]

    def test_call_count_increments_per_session(self, client):
        with patch(self._RUN_TARGET):
            for _ in range(3):
                self._submit(client, "count-session")
            time.sleep(0.05)
            resp = client.get("/api/score/sessions/count-session")
        assert resp.status_code == 200
        assert resp.json()["call_count"] == 3

    def test_get_session_returns_jobs_list(self, client):
        with patch(self._RUN_TARGET):
            self._submit(client, "list-session")
            time.sleep(0.05)
            resp = client.get("/api/score/sessions/list-session")
        assert resp.status_code == 200
        body = resp.json()
        assert len(body["jobs"]) == 1

    def test_get_unknown_session_returns_404(self, client):
        resp = client.get("/api/score/sessions/no-such-session-xyz")
        assert resp.status_code == 404

    def test_get_session_job_by_id(self, client):
        with patch(self._RUN_TARGET):
            submitted = self._submit(client, "job-lookup-session")
            job_id = submitted.json()["job_id"]
            time.sleep(0.05)
            fetched = client.get(f"/api/score/session/jobs/{job_id}")
        assert fetched.status_code == 200
        assert fetched.json()["job_id"] == job_id

    def test_get_unknown_job_returns_404(self, client):
        resp = client.get("/api/score/session/jobs/nonexistent-job-id")
        assert resp.status_code == 404

    def test_missing_session_id_returns_422(self, client):
        # Same payload as a valid call, minus the required session_id.
        payload = {
            "question": "Q",
            "answer": "A",
            "metrics": ["answer_relevancy"],
        }
        resp = client.post("/api/score/session_async", json=payload)
        assert resp.status_code == 422

    def test_list_sessions_endpoint(self, client):
        with patch(self._RUN_TARGET):
            self._submit(client, "list-all-session")
            resp = client.get("/api/score/sessions")
        assert resp.status_code == 200
        assert "sessions" in resp.json()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _mock_request():
    """Return a minimal ScoreRequest (question, answer, one metric) for tests."""
    from webapp.models import ScoreRequest

    request_fields = {
        "question": "What is dual-source CT?",
        "answer": "It uses two X-ray sources.",
        "metrics": ["answer_relevancy"],
    }
    return ScoreRequest(**request_fields)
|
||||
Reference in New Issue
Block a user