Files
siemens_ragas/rag_eval/dataset_builder/schema.py
2026-06-12 14:02:15 +08:00

83 lines
2.4 KiB
Python

"""Pydantic schemas for dataset build YAML configuration files."""
from __future__ import annotations
from pathlib import Path
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field, model_validator
class DatasetBuildInputModel(BaseModel):
    """Input section of a dataset build config: where source files are found.

    Keys not declared here are dropped during validation (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # Location to search for input documents (required).
    # NOTE(review): presumably a directory, possibly relative to the config
    # file -- confirm against the loader that consumes this schema.
    path: str
    # Glob pattern selecting files under ``path``; PDFs only unless overridden.
    glob: str = Field(default="*.pdf")
class DatasetBuildParserModel(BaseModel):
    """Parser section of a dataset build config.

    Selects the document parser backend and, optionally, what to do when
    parsing fails. Keys not declared here are dropped during validation.
    """

    model_config = ConfigDict(extra="ignore")

    # Parser backend; "aliyun_docmind" is the only accepted value.
    provider: Literal["aliyun_docmind"]
    # Either "fail" or "skip"; may be omitted, in which case it stays None.
    # NOTE(review): the effective behavior for None is decided by the
    # consumer of this schema, not here -- confirm the downstream default.
    failure_mode: Literal["fail", "skip"] | None = Field(default=None)
class DatasetBuildGenerationModel(BaseModel):
    """Generation section of a dataset build config.

    Controls which model drafts the questions, the shape of the output, the
    review workflow, and per-document / per-question size limits. Keys not
    declared here are dropped during validation.
    """

    model_config = ConfigDict(extra="ignore")

    # Optional generator model identifier; None when not specified.
    model: str | None = Field(default=None)
    # Kind of artifact produced; only "online_question_bank" is accepted.
    output_type: Literal["online_question_bank"]
    # Review workflow; only "draft_with_manual_review" is accepted.
    review_mode: Literal["draft_with_manual_review"]
    # Upper bound on questions per document; must be strictly positive.
    max_questions_per_document: int = Field(default=10, gt=0)
    # Upper bound on source chunks per question; must be strictly positive.
    max_source_chunks_per_question: int = Field(default=3, gt=0)
class DatasetBuildOutputModel(BaseModel):
    """Output section of a dataset build config: where results are written.

    Both locations are required strings. Keys not declared here are dropped
    during validation.
    """

    model_config = ConfigDict(extra="ignore")

    # Destination of the generated dataset.
    dataset_path: str
    # Directory that receives build artifacts.
    artifact_dir: str
class DatasetBuildRuntimeModel(BaseModel):
    """Runtime section of a dataset build config.

    Every field is optional, so the model can be built with no arguments.
    Keys not declared here are dropped during validation.
    """

    model_config = ConfigDict(extra="ignore")

    # Optional cap on the number of documents; when given it must be > 0.
    # None (the default) means no cap is expressed by this config.
    max_documents: int | None = Field(default=None, gt=0)
class DatasetBuildConfigModel(BaseModel):
    """Root schema of a dataset build job configuration.

    Aggregates the ``input``, ``parser``, ``generation``, ``output`` and
    ``runtime`` sections. Only ``runtime`` may be omitted; it then falls back
    to a default-constructed ``DatasetBuildRuntimeModel``. Keys not declared
    here are dropped during validation.
    """

    model_config = ConfigDict(extra="ignore")

    # Name of the build job; validated below to contain non-whitespace text.
    job_name: str
    input: DatasetBuildInputModel
    parser: DatasetBuildParserModel
    generation: DatasetBuildGenerationModel
    output: DatasetBuildOutputModel
    runtime: DatasetBuildRuntimeModel = Field(
        default_factory=DatasetBuildRuntimeModel,
    )

    @model_validator(mode="after")
    def validate_job_name(self) -> "DatasetBuildConfigModel":
        """Ensure ``job_name`` is not empty or whitespace-only.

        Runs after field validation. The name itself is left untouched
        (surrounding whitespace is not trimmed).

        Returns:
            The validated model instance.

        Raises:
            ValueError: If ``job_name`` contains nothing but whitespace.
        """
        if self.job_name.strip() == "":
            raise ValueError("job_name must not be empty.")
        return self

    def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
        """Turn a configured path string into a ``Path``.

        Args:
            base_dir: Directory that relative paths are anchored to
                (intended to be the config file's directory).
            raw_path: Path string as written in the configuration.

        Returns:
            ``raw_path`` unchanged as a ``Path`` when it is already absolute;
            otherwise ``base_dir / raw_path`` passed through ``resolve()``.
        """
        target = Path(raw_path)
        # Absolute paths are handed back as written, without normalization.
        return target if target.is_absolute() else (base_dir / target).resolve()