"""Tests for the end-to-end pipeline API and pipeline task manager.""" from __future__ import annotations import json import time from pathlib import Path from unittest.mock import MagicMock, patch import pytest from fastapi.testclient import TestClient # ── fixtures ────────────────────────────────────────────────────────────────── @pytest.fixture() def client(tmp_path, monkeypatch): """TestClient with a fresh PipelineTaskManager backed by tmp_path outputs.""" import webapp.services.pipeline_task_manager as mgr_mod from webapp.services.pipeline_task_manager import PipelineTaskManager fresh_mgr = PipelineTaskManager(max_workers=2) monkeypatch.setattr(mgr_mod, "pipeline_task_manager", fresh_mgr) monkeypatch.setattr(mgr_mod, "_PIPELINE_OUTPUT_ROOT", tmp_path / "pipeline") import webapp.api.pipeline as api_mod monkeypatch.setattr(api_mod, "pipeline_task_manager", fresh_mgr) from webapp.server import create_app return TestClient(create_app()) def _minimal_pdf_dir(tmp_path: Path) -> Path: """Create a temp directory that looks like a PDF folder (empty, valid dir).""" d = tmp_path / "pdfs" d.mkdir() return d def _mock_build_result(tmp_path: Path, job, run_id="r1"): """Return a fake DatasetBuildResult with a minimal dataset CSV.""" from rag_eval.dataset_builder.models import ( DatasetBuildArtifactPaths, DatasetBuildResult, DraftQuestionSample, ) artifact_root = tmp_path / "build" / run_id artifact_root.mkdir(parents=True, exist_ok=True) latest = tmp_path / "build" / "latest" latest.mkdir(parents=True, exist_ok=True) chunks_path = artifact_root / "source_chunks.jsonl" chunks_path.write_text( json.dumps({"chunk_id": "c1", "doc_id": "d1", "doc_name": "test.pdf", "text": "CT scan context.", "page_start": 1, "page_end": 1, "section_path": "/", "section_title": "", "source_layout_ids": []}) + "\n", encoding="utf-8", ) (latest / "source_chunks.jsonl").write_text(chunks_path.read_text(encoding="utf-8"), encoding="utf-8") dataset_csv = tmp_path / "generated_dataset.csv" dataset_csv.write_text( "sample_id,question,ground_truth,scenario,language,doc_id,doc_name," "section_path,page_start,page_end,source_chunk_ids,question_type,difficulty," "review_status,review_notes\n" 's1,"What is CT?","CT is imaging.","test","zh","d1","test.pdf","/",' '1,1,"[""c1""]","fact","easy","draft",""\n', encoding="utf-8", ) sample = DraftQuestionSample( sample_id="s1", question="What is CT?", ground_truth="CT is imaging.", scenario="test", language="zh", doc_id="d1", doc_name="test.pdf", section_path="/", page_start=1, page_end=1, source_chunk_ids=["c1"], question_type="fact", difficulty="easy", ) artifact_paths = DatasetBuildArtifactPaths( root_dir=artifact_root, documents_jsonl=artifact_root / "documents.jsonl", semantic_blocks_jsonl=artifact_root / "semantic_blocks.jsonl", source_chunks_jsonl=chunks_path, dataset_draft_csv=artifact_root / "dataset_draft.csv", parse_failures_csv=artifact_root / "parse_failures.csv", metadata_json=artifact_root / "metadata.json", ) return DatasetBuildResult( job=job, run_id=run_id, artifact_paths=artifact_paths, documents=[], draft_samples=[sample], parse_failures=[], ) def _mock_eval_result(tmp_path: Path, scenario): """Return a fake EvaluationResult.""" from rag_eval.shared.models import EvaluationResult return EvaluationResult( scenario=scenario, run_id="eval-r1", started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00", valid_samples=[], invalid_samples=[], score_rows=[], ) # ── API route tests ──────────────────────────────────────────────────────────── def test_submit_returns_202_and_job_id(client, tmp_path): """POST /api/pipeline/jobs returns 202 with job_id immediately.""" pdf_dir = _minimal_pdf_dir(tmp_path) with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec: from webapp.models import PipelineResult mock_exec.return_value = PipelineResult( build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv", source_chunks_jsonl="/tmp/c.jsonl", total_questions=1, parse_failures=0, eval_run_id="r1", eval_output_dir="/tmp/e", scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md", ) resp = client.post("/api/pipeline/jobs", json={ "docs_path": str(pdf_dir), "job_name": "test-job", }) assert resp.status_code == 202 data = resp.json() assert "job_id" in data assert data["job_name"] == "test-job" # status may already be completed by the time the response is read (mock runs instantly) assert data["status"] in ("queued", "completed") def test_get_nonexistent_job_returns_404(client): """GET /api/pipeline/jobs/{id} returns 404 for unknown job.""" resp = client.get("/api/pipeline/jobs/doesnotexist") assert resp.status_code == 404 def test_list_jobs_returns_empty_initially(client): """GET /api/pipeline/jobs returns empty list when no jobs submitted.""" resp = client.get("/api/pipeline/jobs") assert resp.status_code == 200 assert resp.json()["jobs"] == [] def test_job_status_polling(client, tmp_path): """Submitted job becomes visible via GET /api/pipeline/jobs/{id}.""" pdf_dir = _minimal_pdf_dir(tmp_path) with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec: from webapp.models import PipelineResult mock_exec.return_value = PipelineResult( build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv", source_chunks_jsonl="/tmp/c.jsonl", total_questions=3, parse_failures=0, eval_run_id="r2", eval_output_dir="/tmp/e", scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md", ) post_resp = client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir)}) job_id = post_resp.json()["job_id"] # Poll until done or timeout (max 5s for mock) for _ in range(20): status_resp = client.get(f"/api/pipeline/jobs/{job_id}") assert status_resp.status_code == 200 status = status_resp.json() if status["status"] in ("completed", "failed"): break time.sleep(0.25) assert status["status"] == "completed" assert status["result"]["total_questions"] == 3 def test_job_fails_on_invalid_docs_path(client): """Job fails quickly if docs_path does not exist.""" resp = client.post("/api/pipeline/jobs", json={ "docs_path": "/nonexistent/path/that/does/not/exist", }) assert resp.status_code == 202 job_id = resp.json()["job_id"] for _ in range(20): status_resp = client.get(f"/api/pipeline/jobs/{job_id}") status = status_resp.json() if status["status"] in ("completed", "failed"): break time.sleep(0.25) assert status["status"] == "failed" assert "docs_path" in status["error"] or "not" in status["error"].lower() def test_list_jobs_shows_submitted(client, tmp_path): """GET /api/pipeline/jobs includes jobs after submission.""" pdf_dir = _minimal_pdf_dir(tmp_path) with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec: from webapp.models import PipelineResult mock_exec.return_value = PipelineResult( build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv", source_chunks_jsonl="/tmp/c.jsonl", total_questions=1, parse_failures=0, eval_run_id="r3", eval_output_dir="/tmp/e", scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md", ) client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir), "job_name": "listed-job"}) time.sleep(0.5) list_resp = client.get("/api/pipeline/jobs") assert list_resp.status_code == 200 jobs = list_resp.json()["jobs"] assert len(jobs) >= 1 names = [j["job_name"] for j in jobs] assert "listed-job" in names # ── execute_dataset_build_job refactor test ──────────────────────────────────── def test_execute_dataset_build_job_directly(tmp_path): """execute_dataset_build_job runs the build without a YAML file.""" from unittest.mock import patch as _patch from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime from rag_eval.dataset_builder.runner import execute_dataset_build_job from rag_eval.settings import EvaluationSettings pdf_dir = tmp_path / "pdfs" pdf_dir.mkdir() (pdf_dir / "doc.pdf").write_bytes(b"%PDF-fake") job = DatasetBuildJob( job_name="direct-test", input_path=pdf_dir, input_glob="*.pdf", parser_provider="aliyun_docmind", failure_mode="skip", generation_model="test-model", output_type="online_question_bank", review_mode="draft_with_manual_review", max_questions_per_document=5, max_source_chunks_per_question=3, dataset_path=tmp_path / "out.csv", artifact_dir=tmp_path / "artifacts", runtime=DatasetBuildRuntime(max_documents=1), ) mock_doc = MagicMock() mock_doc.doc_id = "d1" mock_doc.doc_name = "doc.pdf" mock_doc.source_chunks = [] mock_doc.semantic_blocks = [] mock_doc.raw_text = "" mock_doc.structure_nodes = [] mock_doc.metadata = {} mock_doc.to_record.return_value = { "doc_id": "d1", "doc_name": "doc.pdf", "raw_text": "", "structure_nodes": [], "metadata": {}, "semantic_block_count": 0, "source_chunk_count": 0, } mock_parser = MagicMock() mock_parser.parse.return_value = mock_doc mock_generator = MagicMock() mock_generator.generate.return_value = [] result = execute_dataset_build_job( job, settings=EvaluationSettings(_env_file=None), parser=mock_parser, generator=mock_generator, ) assert result.job.job_name == "direct-test" assert result.artifact_paths.root_dir.exists()