first commit
This commit is contained in:
203
rag_eval/dataset_builder/models.py
Normal file
203
rag_eval/dataset_builder/models.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""Internal data models for the PDF-to-dataset build workflow."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
|
||||
# Closed sets of string values used as field types by the models in this module.

# Review state of a draft sample; DraftQuestionSample defaults it to "draft".
ReviewStatus = Literal["draft", "approved", "rejected", "needs_edit"]

# Category label attached to a draft question.
QuestionType = Literal["fact", "summary", "procedure", "comparison"]

# Difficulty label attached to a draft question.
Difficulty = Literal["easy", "medium", "hard"]

# Type of DatasetBuildJob.failure_mode. Presumably "fail" aborts the build and
# "skip" continues past a failing document -- confirm against the build runner.
FailureMode = Literal["fail", "skip"]
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DatasetBuildRuntime:
|
||||
"""Runtime controls for one dataset build job."""
|
||||
|
||||
max_documents: int | None = None
|
||||
|
||||
|
||||
@dataclass(slots=True)
class DatasetBuildJob:
    """Resolved dataset build configuration consumed by the build runner.

    This class only stores the values and serializes them via ``snapshot``;
    how each setting is interpreted is decided by the consumer, which is not
    visible in this module.
    """

    job_name: str
    # Input location plus the glob pattern applied to it (presumably the glob
    # is evaluated relative to input_path -- confirm against the runner).
    input_path: Path
    input_glob: str
    parser_provider: str
    failure_mode: FailureMode
    generation_model: str
    output_type: str
    review_mode: str
    max_questions_per_document: int
    max_source_chunks_per_question: int
    dataset_path: Path
    artifact_dir: Path
    # A fresh DatasetBuildRuntime per job, so instances never share one.
    runtime: DatasetBuildRuntime = field(default_factory=DatasetBuildRuntime)
    # Optional; stays None when unset and is serialized as None by snapshot().
    source_path: Path | None = None

    def snapshot(self) -> dict[str, Any]:
        """Serialize the job into JSON-friendly metadata.

        Returns:
            A new dict holding every field. ``asdict`` recurses into nested
            dataclasses, so ``runtime`` appears as a nested dict. Every
            top-level ``Path`` value is rendered as a POSIX string; a
            ``source_path`` of ``None`` is kept as ``None``.
        """
        payload = asdict(self)
        # Convert by value type rather than by a hard-coded list of field
        # names, so a Path field added later cannot be forgotten here.
        # Only existing keys are reassigned, which is safe while iterating.
        for key, value in payload.items():
            if isinstance(value, Path):
                payload[key] = value.as_posix()
        return payload
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class StructureNode:
|
||||
"""One normalized structure heading extracted from layout results."""
|
||||
|
||||
node_id: str
|
||||
level: int
|
||||
title: str
|
||||
page_start: int
|
||||
page_end: int
|
||||
section_path: str
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SemanticBlock:
|
||||
"""One merged semantic block used as an intermediate artifact before chunking."""
|
||||
|
||||
block_id: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
text: str
|
||||
page_start: int
|
||||
page_end: int
|
||||
section_path: str
|
||||
section_title: str
|
||||
source_layout_ids: list[str]
|
||||
|
||||
def to_record(self) -> dict[str, Any]:
|
||||
"""Convert the block into a flat artifact record."""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SourceChunk:
|
||||
"""Evidence chunk used for question generation and human review."""
|
||||
|
||||
chunk_id: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
text: str
|
||||
page_start: int
|
||||
page_end: int
|
||||
section_path: str
|
||||
section_title: str
|
||||
source_layout_ids: list[str]
|
||||
|
||||
def to_record(self) -> dict[str, Any]:
|
||||
"""Convert the chunk into a flat artifact record."""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ParsedDocument:
|
||||
"""Normalized parsed document ready for question generation."""
|
||||
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
raw_text: str
|
||||
structure_nodes: list[StructureNode]
|
||||
semantic_blocks: list[SemanticBlock]
|
||||
source_chunks: list[SourceChunk]
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def to_record(self) -> dict[str, Any]:
|
||||
"""Convert the parsed document into a summary artifact record."""
|
||||
return {
|
||||
"doc_id": self.doc_id,
|
||||
"doc_name": self.doc_name,
|
||||
"raw_text": self.raw_text,
|
||||
"structure_nodes": [asdict(item) for item in self.structure_nodes],
|
||||
"metadata": self.metadata,
|
||||
"semantic_block_count": len(self.semantic_blocks),
|
||||
"source_chunk_count": len(self.source_chunks),
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DraftQuestionSample:
|
||||
"""One draft online evaluation sample pending manual review."""
|
||||
|
||||
sample_id: str
|
||||
question: str
|
||||
ground_truth: str
|
||||
scenario: str
|
||||
language: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
section_path: str
|
||||
page_start: int
|
||||
page_end: int
|
||||
source_chunk_ids: list[str]
|
||||
question_type: QuestionType
|
||||
difficulty: Difficulty
|
||||
review_status: ReviewStatus = "draft"
|
||||
review_notes: str = ""
|
||||
|
||||
def to_record(self) -> dict[str, Any]:
|
||||
"""Convert the draft sample into a flat CSV row."""
|
||||
return {
|
||||
"sample_id": self.sample_id,
|
||||
"question": self.question,
|
||||
"ground_truth": self.ground_truth,
|
||||
"scenario": self.scenario,
|
||||
"language": self.language,
|
||||
"doc_id": self.doc_id,
|
||||
"doc_name": self.doc_name,
|
||||
"section_path": self.section_path,
|
||||
"page_start": self.page_start,
|
||||
"page_end": self.page_end,
|
||||
"source_chunk_ids": self.source_chunk_ids,
|
||||
"question_type": self.question_type,
|
||||
"difficulty": self.difficulty,
|
||||
"review_status": self.review_status,
|
||||
"review_notes": self.review_notes,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ParseFailure:
|
||||
"""One document parse failure recorded for reporting and skip-mode execution."""
|
||||
|
||||
file_path: str
|
||||
error: str
|
||||
|
||||
def to_record(self) -> dict[str, str]:
|
||||
"""Convert the failure into a flat CSV row."""
|
||||
return asdict(self)
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DatasetBuildArtifactPaths:
|
||||
"""Canonical file paths produced by one dataset build run."""
|
||||
|
||||
root_dir: Path
|
||||
documents_jsonl: Path
|
||||
semantic_blocks_jsonl: Path
|
||||
source_chunks_jsonl: Path
|
||||
dataset_draft_csv: Path
|
||||
parse_failures_csv: Path
|
||||
metadata_json: Path
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DatasetBuildResult:
|
||||
"""Aggregate result object returned after a dataset build completes."""
|
||||
|
||||
job: DatasetBuildJob
|
||||
run_id: str
|
||||
artifact_paths: DatasetBuildArtifactPaths
|
||||
documents: list[ParsedDocument]
|
||||
draft_samples: list[DraftQuestionSample]
|
||||
parse_failures: list[ParseFailure]
|
||||
Reference in New Issue
Block a user