204 lines
5.6 KiB
Python
204 lines
5.6 KiB
Python
"""Internal data models for the PDF-to-dataset build workflow."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import asdict, dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Literal
|
|
|
|
|
|
# Values accepted by ``DraftQuestionSample.review_status``.
ReviewStatus = Literal[
    "draft",
    "approved",
    "rejected",
    "needs_edit",
]

# Values accepted by ``DraftQuestionSample.question_type``.
QuestionType = Literal[
    "fact",
    "summary",
    "procedure",
    "comparison",
]

# Values accepted by ``DraftQuestionSample.difficulty``.
Difficulty = Literal[
    "easy",
    "medium",
    "hard",
]

# Values accepted by ``DatasetBuildJob.failure_mode``.
FailureMode = Literal[
    "fail",
    "skip",
]
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class DatasetBuildRuntime:
|
|
"""Runtime controls for one dataset build job."""
|
|
|
|
max_documents: int | None = None
|
|
|
|
|
|
@dataclass(slots=True)
class DatasetBuildJob:
    """Fully resolved build configuration handed to the build runner."""

    job_name: str
    input_path: Path
    input_glob: str
    parser_provider: str
    failure_mode: FailureMode
    generation_model: str
    output_type: str
    review_mode: str
    max_questions_per_document: int
    max_source_chunks_per_question: int
    dataset_path: Path
    artifact_dir: Path
    # A fresh ``DatasetBuildRuntime`` is created per job when none is supplied.
    runtime: DatasetBuildRuntime = field(default_factory=DatasetBuildRuntime)
    # Optional; stays ``None`` unless the caller provides it.
    source_path: Path | None = None

    def snapshot(self) -> dict[str, Any]:
        """Return the job as a metadata dict with its paths rendered as strings.

        The dict comes from ``dataclasses.asdict`` (so ``runtime`` becomes a
        nested dict). ``input_path``, ``dataset_path`` and ``artifact_dir`` are
        then replaced by their ``as_posix()`` form; ``source_path`` is replaced
        the same way only when it is set, otherwise it remains ``None``.
        """
        record = asdict(self)
        path_fields = {
            "input_path": self.input_path,
            "dataset_path": self.dataset_path,
            "artifact_dir": self.artifact_dir,
        }
        if self.source_path is not None:
            path_fields["source_path"] = self.source_path
        for key, location in path_fields.items():
            record[key] = location.as_posix()
        return record
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class StructureNode:
|
|
"""One normalized structure heading extracted from layout results."""
|
|
|
|
node_id: str
|
|
level: int
|
|
title: str
|
|
page_start: int
|
|
page_end: int
|
|
section_path: str
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class SemanticBlock:
|
|
"""One merged semantic block used as an intermediate artifact before chunking."""
|
|
|
|
block_id: str
|
|
doc_id: str
|
|
doc_name: str
|
|
text: str
|
|
page_start: int
|
|
page_end: int
|
|
section_path: str
|
|
section_title: str
|
|
source_layout_ids: list[str]
|
|
|
|
def to_record(self) -> dict[str, Any]:
|
|
"""Convert the block into a flat artifact record."""
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class SourceChunk:
|
|
"""Evidence chunk used for question generation and human review."""
|
|
|
|
chunk_id: str
|
|
doc_id: str
|
|
doc_name: str
|
|
text: str
|
|
page_start: int
|
|
page_end: int
|
|
section_path: str
|
|
section_title: str
|
|
source_layout_ids: list[str]
|
|
|
|
def to_record(self) -> dict[str, Any]:
|
|
"""Convert the chunk into a flat artifact record."""
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class ParsedDocument:
|
|
"""Normalized parsed document ready for question generation."""
|
|
|
|
doc_id: str
|
|
doc_name: str
|
|
raw_text: str
|
|
structure_nodes: list[StructureNode]
|
|
semantic_blocks: list[SemanticBlock]
|
|
source_chunks: list[SourceChunk]
|
|
metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
def to_record(self) -> dict[str, Any]:
|
|
"""Convert the parsed document into a summary artifact record."""
|
|
return {
|
|
"doc_id": self.doc_id,
|
|
"doc_name": self.doc_name,
|
|
"raw_text": self.raw_text,
|
|
"structure_nodes": [asdict(item) for item in self.structure_nodes],
|
|
"metadata": self.metadata,
|
|
"semantic_block_count": len(self.semantic_blocks),
|
|
"source_chunk_count": len(self.source_chunks),
|
|
}
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class DraftQuestionSample:
|
|
"""One draft online evaluation sample pending manual review."""
|
|
|
|
sample_id: str
|
|
question: str
|
|
ground_truth: str
|
|
scenario: str
|
|
language: str
|
|
doc_id: str
|
|
doc_name: str
|
|
section_path: str
|
|
page_start: int
|
|
page_end: int
|
|
source_chunk_ids: list[str]
|
|
question_type: QuestionType
|
|
difficulty: Difficulty
|
|
review_status: ReviewStatus = "draft"
|
|
review_notes: str = ""
|
|
|
|
def to_record(self) -> dict[str, Any]:
|
|
"""Convert the draft sample into a flat CSV row."""
|
|
return {
|
|
"sample_id": self.sample_id,
|
|
"question": self.question,
|
|
"ground_truth": self.ground_truth,
|
|
"scenario": self.scenario,
|
|
"language": self.language,
|
|
"doc_id": self.doc_id,
|
|
"doc_name": self.doc_name,
|
|
"section_path": self.section_path,
|
|
"page_start": self.page_start,
|
|
"page_end": self.page_end,
|
|
"source_chunk_ids": self.source_chunk_ids,
|
|
"question_type": self.question_type,
|
|
"difficulty": self.difficulty,
|
|
"review_status": self.review_status,
|
|
"review_notes": self.review_notes,
|
|
}
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class ParseFailure:
|
|
"""One document parse failure recorded for reporting and skip-mode execution."""
|
|
|
|
file_path: str
|
|
error: str
|
|
|
|
def to_record(self) -> dict[str, str]:
|
|
"""Convert the failure into a flat CSV row."""
|
|
return asdict(self)
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class DatasetBuildArtifactPaths:
|
|
"""Canonical file paths produced by one dataset build run."""
|
|
|
|
root_dir: Path
|
|
documents_jsonl: Path
|
|
semantic_blocks_jsonl: Path
|
|
source_chunks_jsonl: Path
|
|
dataset_draft_csv: Path
|
|
parse_failures_csv: Path
|
|
metadata_json: Path
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class DatasetBuildResult:
|
|
"""Aggregate result object returned after a dataset build completes."""
|
|
|
|
job: DatasetBuildJob
|
|
run_id: str
|
|
artifact_paths: DatasetBuildArtifactPaths
|
|
documents: list[ParsedDocument]
|
|
draft_samples: list[DraftQuestionSample]
|
|
parse_failures: list[ParseFailure]
|