83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
"""Pydantic schemas for dataset build YAML configuration files."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
|
|
|
|
class DatasetBuildInputModel(BaseModel):
    """Describe where the input PDFs for a dataset build are discovered.

    Attributes:
        path: Required location string for the inputs. It is stored as
            given; no path resolution happens in this model.
        glob: Filename pattern used for discovery. Defaults to ``"*.pdf"``.
    """

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    path: str
    glob: str = Field(default="*.pdf")
|
|
|
|
|
|
class DatasetBuildParserModel(BaseModel):
    """Describe which document parser to use and how failures are handled.

    Attributes:
        provider: Parser backend identifier. ``"aliyun_docmind"`` is the
            only value accepted by this schema.
        failure_mode: Either ``"fail"`` or ``"skip"``; defaults to ``None``
            when the key is omitted.
            NOTE(review): how ``None`` is interpreted is decided by the
            consumer of this model -- confirm against the caller.
    """

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    provider: Literal["aliyun_docmind"]
    failure_mode: Literal["fail", "skip"] | None = Field(default=None)
|
|
|
|
|
|
class DatasetBuildGenerationModel(BaseModel):
    """Describe the controls for question generation.

    Attributes:
        model: Optional model name; ``None`` when the key is omitted.
        output_type: Required; ``"online_question_bank"`` is the only value
            accepted by this schema.
        review_mode: Required; ``"draft_with_manual_review"`` is the only
            value accepted by this schema.
        max_questions_per_document: Strictly positive integer, default 10.
        max_source_chunks_per_question: Strictly positive integer,
            default 3.
    """

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    model: str | None = Field(default=None)
    output_type: Literal["online_question_bank"]
    review_mode: Literal["draft_with_manual_review"]
    # ``gt=0`` rejects zero and negative limits at validation time.
    max_questions_per_document: int = Field(10, gt=0)
    max_source_chunks_per_question: int = Field(3, gt=0)
|
|
|
|
|
|
class DatasetBuildOutputModel(BaseModel):
    """Describe where a dataset build writes its results.

    Both fields are required and are stored as plain strings; this model
    does not resolve or validate them as filesystem paths.

    Attributes:
        dataset_path: Location string for the produced dataset.
        artifact_dir: Location string for build artifacts.
    """

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    dataset_path: str
    artifact_dir: str
|
|
|
|
|
|
class DatasetBuildRuntimeModel(BaseModel):
    """Describe runtime limits for a dataset build.

    Attributes:
        max_documents: Optional strictly positive integer; ``None`` when
            the key is omitted.
            NOTE(review): ``None`` presumably means "no cap" -- confirm
            against the code that consumes this model.
    """

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    # ``gt=0`` applies only when a number is supplied; None stays allowed.
    max_documents: int | None = Field(None, gt=0)
|
|
|
|
|
|
class DatasetBuildConfigModel(BaseModel):
    """Root schema that ties together every section of a dataset build job.

    Attributes:
        job_name: Required name of the job; must contain at least one
            non-whitespace character.
        input: Input discovery settings.
        parser: Parser selection settings.
        generation: Question generation settings.
        output: Output location settings.
        runtime: Runtime limits. When the section is omitted, a
            ``DatasetBuildRuntimeModel`` built from its own defaults is used.
    """

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    job_name: str
    input: DatasetBuildInputModel
    parser: DatasetBuildParserModel
    generation: DatasetBuildGenerationModel
    output: DatasetBuildOutputModel
    runtime: DatasetBuildRuntimeModel = Field(default_factory=DatasetBuildRuntimeModel)

    @model_validator(mode="after")
    def validate_job_name(self) -> "DatasetBuildConfigModel":
        """Ensure ``job_name`` is not empty or whitespace only.

        Returns:
            The validated model instance, unchanged.

        Raises:
            ValueError: If ``job_name`` has no non-whitespace characters.
        """
        # The name is only inspected here; its stored value is not stripped.
        if self.job_name.strip():
            return self
        raise ValueError("job_name must not be empty.")

    def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
        """Turn a configured path string into a ``Path`` anchored at ``base_dir``.

        Args:
            base_dir: Directory that relative paths are joined onto.
            raw_path: Path string as written in the configuration.

        Returns:
            ``raw_path`` as a ``Path`` if it is already absolute (returned
            as-is, without calling ``resolve()``); otherwise
            ``base_dir / raw_path`` after ``resolve()``.
        """
        path = Path(raw_path)
        return path if path.is_absolute() else (base_dir / path).resolve()
|