# siemens_ragas/rag_eval/dataset_builder/models.py

"""Internal data models for the PDF-to-dataset build workflow."""

from __future__ import annotations

from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Literal


# String values a type checker accepts for ``DraftQuestionSample.review_status``;
# that field defaults to "draft". Literal types are not enforced at runtime.
ReviewStatus = Literal["draft", "approved", "rejected", "needs_edit"]
# String values a type checker accepts for ``DraftQuestionSample.question_type``.
QuestionType = Literal["fact", "summary", "procedure", "comparison"]
# String values a type checker accepts for ``DraftQuestionSample.difficulty``.
Difficulty = Literal["easy", "medium", "hard"]
# String values a type checker accepts for ``DatasetBuildJob.failure_mode``.
# NOTE(review): presumably "fail" aborts the build on a parse error while "skip"
# records a ``ParseFailure`` and continues — confirm against the build runner.
FailureMode = Literal["fail", "skip"]


@dataclass(slots=True)
class DatasetBuildRuntime:
    """Runtime controls for one dataset build job.

    A fresh instance is the default value of ``DatasetBuildJob.runtime``, and
    ``DatasetBuildJob.snapshot`` expands it into a nested dict via ``asdict``.
    """

    # Defaults to None. NOTE(review): presumably None means "no cap on the
    # number of documents processed" — confirm against the build runner.
    max_documents: int | None = None


@dataclass(slots=True)
class DatasetBuildJob:
    """Fully resolved settings for a single dataset build, as read by the build runner.

    ``runtime`` gets its own fresh ``DatasetBuildRuntime`` when not supplied,
    and ``source_path`` is optional (``None`` by default).
    """

    job_name: str
    input_path: Path
    input_glob: str
    parser_provider: str
    failure_mode: FailureMode
    generation_model: str
    output_type: str
    review_mode: str
    max_questions_per_document: int
    max_source_chunks_per_question: int
    dataset_path: Path
    artifact_dir: Path
    runtime: DatasetBuildRuntime = field(default_factory=DatasetBuildRuntime)
    source_path: Path | None = None

    def snapshot(self) -> dict[str, Any]:
        """Return the job as a plain dict suitable for JSON metadata.

        ``asdict`` copies every field and expands the nested ``runtime``
        dataclass into a dict. The ``Path`` fields are then overwritten with
        their POSIX string form; ``source_path`` stays ``None`` when unset.
        """
        snapshot_data = asdict(self)
        # The three required path fields are always rendered as strings.
        snapshot_data.update(
            input_path=self.input_path.as_posix(),
            dataset_path=self.dataset_path.as_posix(),
            artifact_dir=self.artifact_dir.as_posix(),
        )
        if self.source_path is None:
            return snapshot_data
        snapshot_data["source_path"] = self.source_path.as_posix()
        return snapshot_data


@dataclass(slots=True)
class StructureNode:
    """One normalized structure heading extracted from layout results.

    ``ParsedDocument`` holds a list of these and serializes each one with
    ``asdict`` inside ``ParsedDocument.to_record``.
    """

    node_id: str
    # NOTE(review): presumably the heading depth; whether it starts at 0 or 1
    # is not visible in this module — confirm with the parser.
    level: int
    title: str
    # NOTE(review): the page-numbering base and whether ``page_end`` is
    # inclusive are not visible in this module — confirm with the parser.
    page_start: int
    page_end: int
    section_path: str


@dataclass(slots=True)
class SemanticBlock:
    """One merged semantic block used as an intermediate artifact before chunking."""

    block_id: str
    doc_id: str
    doc_name: str
    text: str
    page_start: int
    page_end: int
    section_path: str
    section_title: str
    source_layout_ids: list[str]

    def to_record(self) -> dict[str, Any]:
        """Convert the block into a flat artifact record."""
        return asdict(self)


@dataclass(slots=True)
class SourceChunk:
    """Evidence chunk used for question generation and human review."""

    chunk_id: str
    doc_id: str
    doc_name: str
    text: str
    page_start: int
    page_end: int
    section_path: str
    section_title: str
    source_layout_ids: list[str]

    def to_record(self) -> dict[str, Any]:
        """Convert the chunk into a flat artifact record."""
        return asdict(self)


@dataclass(slots=True)
class ParsedDocument:
    """Normalized parsed document ready for question generation."""

    doc_id: str
    doc_name: str
    raw_text: str
    structure_nodes: list[StructureNode]
    semantic_blocks: list[SemanticBlock]
    source_chunks: list[SourceChunk]
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_record(self) -> dict[str, Any]:
        """Convert the parsed document into a summary artifact record."""
        return {
            "doc_id": self.doc_id,
            "doc_name": self.doc_name,
            "raw_text": self.raw_text,
            "structure_nodes": [asdict(item) for item in self.structure_nodes],
            "metadata": self.metadata,
            "semantic_block_count": len(self.semantic_blocks),
            "source_chunk_count": len(self.source_chunks),
        }


@dataclass(slots=True)
class DraftQuestionSample:
    """One draft online evaluation sample pending manual review."""

    sample_id: str
    question: str
    ground_truth: str
    scenario: str
    language: str
    doc_id: str
    doc_name: str
    section_path: str
    page_start: int
    page_end: int
    source_chunk_ids: list[str]
    question_type: QuestionType
    difficulty: Difficulty
    review_status: ReviewStatus = "draft"
    review_notes: str = ""

    def to_record(self) -> dict[str, Any]:
        """Convert the draft sample into a flat CSV row."""
        return {
            "sample_id": self.sample_id,
            "question": self.question,
            "ground_truth": self.ground_truth,
            "scenario": self.scenario,
            "language": self.language,
            "doc_id": self.doc_id,
            "doc_name": self.doc_name,
            "section_path": self.section_path,
            "page_start": self.page_start,
            "page_end": self.page_end,
            "source_chunk_ids": self.source_chunk_ids,
            "question_type": self.question_type,
            "difficulty": self.difficulty,
            "review_status": self.review_status,
            "review_notes": self.review_notes,
        }


@dataclass(slots=True)
class ParseFailure:
    """One document parse failure recorded for reporting and skip-mode execution."""

    file_path: str
    error: str

    def to_record(self) -> dict[str, str]:
        """Convert the failure into a flat CSV row."""
        return asdict(self)


@dataclass(slots=True)
class DatasetBuildArtifactPaths:
    """Canonical file paths produced by one dataset build run.

    ``DatasetBuildResult.artifact_paths`` carries one instance of this class.
    """

    root_dir: Path
    # NOTE(review): the field-name suffixes suggest the file formats (JSON
    # Lines, CSV, JSON); the writers are not visible in this module — confirm.
    documents_jsonl: Path
    semantic_blocks_jsonl: Path
    source_chunks_jsonl: Path
    dataset_draft_csv: Path
    parse_failures_csv: Path
    metadata_json: Path


@dataclass(slots=True)
class DatasetBuildResult:
    """Aggregate result object returned after a dataset build completes.

    Bundles the job configuration, a run identifier, the artifact paths, and
    the in-memory documents, draft samples, and parse failures of one run.
    """

    job: DatasetBuildJob
    # NOTE(review): the format and origin of the run identifier are not
    # visible in this module — confirm with the build runner.
    run_id: str
    artifact_paths: DatasetBuildArtifactPaths
    documents: list[ParsedDocument]
    draft_samples: list[DraftQuestionSample]
    parse_failures: list[ParseFailure]