first commit

2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions
--- a/rag_eval/dataset_builder/models.py
+++ b/rag_eval/dataset_builder/models.py
@@ -0,0 +1,203 @@
+"""Internal data models for the PDF-to-dataset build workflow."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+
+
+ReviewStatus = Literal["draft", "approved", "rejected", "needs_edit"]
+QuestionType = Literal["fact", "summary", "procedure", "comparison"]
+Difficulty = Literal["easy", "medium", "hard"]
+FailureMode = Literal["fail", "skip"]
+
+
+@dataclass(slots=True)
+class DatasetBuildRuntime:
+    """Runtime controls for one dataset build job."""
+
+    max_documents: int | None = None
+
+
+@dataclass(slots=True)
+class DatasetBuildJob:
+    """Resolved dataset build configuration consumed by the build runner."""
+
+    job_name: str
+    input_path: Path
+    input_glob: str
+    parser_provider: str
+    failure_mode: FailureMode
+    generation_model: str
+    output_type: str
+    review_mode: str
+    max_questions_per_document: int
+    max_source_chunks_per_question: int
+    dataset_path: Path
+    artifact_dir: Path
+    runtime: DatasetBuildRuntime = field(default_factory=DatasetBuildRuntime)
+    source_path: Path | None = None
+
+    def snapshot(self) -> dict[str, Any]:
+        """Serialize the job into JSON-friendly metadata."""
+        payload = asdict(self)
+        payload["input_path"] = self.input_path.as_posix()
+        payload["dataset_path"] = self.dataset_path.as_posix()
+        payload["artifact_dir"] = self.artifact_dir.as_posix()
+        if self.source_path is not None:
+            payload["source_path"] = self.source_path.as_posix()
+        return payload
+
+
+@dataclass(slots=True)
+class StructureNode:
+    """One normalized structure heading extracted from layout results."""
+
+    node_id: str
+    level: int
+    title: str
+    page_start: int
+    page_end: int
+    section_path: str
+
+
+@dataclass(slots=True)
+class SemanticBlock:
+    """One merged semantic block used as an intermediate artifact before chunking."""
+
+    block_id: str
+    doc_id: str
+    doc_name: str
+    text: str
+    page_start: int
+    page_end: int
+    section_path: str
+    section_title: str
+    source_layout_ids: list[str]
+
+    def to_record(self) -> dict[str, Any]:
+        """Convert the block into a flat artifact record."""
+        return asdict(self)
+
+
+@dataclass(slots=True)
+class SourceChunk:
+    """Evidence chunk used for question generation and human review."""
+
+    chunk_id: str
+    doc_id: str
+    doc_name: str
+    text: str
+    page_start: int
+    page_end: int
+    section_path: str
+    section_title: str
+    source_layout_ids: list[str]
+
+    def to_record(self) -> dict[str, Any]:
+        """Convert the chunk into a flat artifact record."""
+        return asdict(self)
+
+
+@dataclass(slots=True)
+class ParsedDocument:
+    """Normalized parsed document ready for question generation."""
+
+    doc_id: str
+    doc_name: str
+    raw_text: str
+    structure_nodes: list[StructureNode]
+    semantic_blocks: list[SemanticBlock]
+    source_chunks: list[SourceChunk]
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_record(self) -> dict[str, Any]:
+        """Convert the parsed document into a summary artifact record."""
+        return {
+            "doc_id": self.doc_id,
+            "doc_name": self.doc_name,
+            "raw_text": self.raw_text,
+            "structure_nodes": [asdict(item) for item in self.structure_nodes],
+            "metadata": self.metadata,
+            "semantic_block_count": len(self.semantic_blocks),
+            "source_chunk_count": len(self.source_chunks),
+        }
+
+
+@dataclass(slots=True)
+class DraftQuestionSample:
+    """One draft online evaluation sample pending manual review."""
+
+    sample_id: str
+    question: str
+    ground_truth: str
+    scenario: str
+    language: str
+    doc_id: str
+    doc_name: str
+    section_path: str
+    page_start: int
+    page_end: int
+    source_chunk_ids: list[str]
+    question_type: QuestionType
+    difficulty: Difficulty
+    review_status: ReviewStatus = "draft"
+    review_notes: str = ""
+
+    def to_record(self) -> dict[str, Any]:
+        """Convert the draft sample into a flat CSV row."""
+        return {
+            "sample_id": self.sample_id,
+            "question": self.question,
+            "ground_truth": self.ground_truth,
+            "scenario": self.scenario,
+            "language": self.language,
+            "doc_id": self.doc_id,
+            "doc_name": self.doc_name,
+            "section_path": self.section_path,
+            "page_start": self.page_start,
+            "page_end": self.page_end,
+            "source_chunk_ids": self.source_chunk_ids,
+            "question_type": self.question_type,
+            "difficulty": self.difficulty,
+            "review_status": self.review_status,
+            "review_notes": self.review_notes,
+        }
+
+
+@dataclass(slots=True)
+class ParseFailure:
+    """One document parse failure recorded for reporting and skip-mode execution."""
+
+    file_path: str
+    error: str
+
+    def to_record(self) -> dict[str, str]:
+        """Convert the failure into a flat CSV row."""
+        return asdict(self)
+
+
+@dataclass(slots=True)
+class DatasetBuildArtifactPaths:
+    """Canonical file paths produced by one dataset build run."""
+
+    root_dir: Path
+    documents_jsonl: Path
+    semantic_blocks_jsonl: Path
+    source_chunks_jsonl: Path
+    dataset_draft_csv: Path
+    parse_failures_csv: Path
+    metadata_json: Path
+
+
+@dataclass(slots=True)
+class DatasetBuildResult:
+    """Aggregate result object returned after a dataset build completes."""
+
+    job: DatasetBuildJob
+    run_id: str
+    artifact_paths: DatasetBuildArtifactPaths
+    documents: list[ParsedDocument]
+    draft_samples: list[DraftQuestionSample]
+    parse_failures: list[ParseFailure]