first commit
This commit is contained in:
82
rag_eval/dataset_builder/schema.py
Normal file
82
rag_eval/dataset_builder/schema.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Pydantic schemas for dataset build YAML configuration files."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
|
||||
|
||||
class DatasetBuildInputModel(BaseModel):
|
||||
"""Schema for input PDF discovery settings."""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
path: str
|
||||
glob: str = "*.pdf"
|
||||
|
||||
|
||||
class DatasetBuildParserModel(BaseModel):
|
||||
"""Schema for parser selection and failure handling."""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
provider: Literal["aliyun_docmind"]
|
||||
failure_mode: Literal["fail", "skip"] | None = None
|
||||
|
||||
|
||||
class DatasetBuildGenerationModel(BaseModel):
|
||||
"""Schema for question generation controls."""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
model: str | None = None
|
||||
output_type: Literal["online_question_bank"]
|
||||
review_mode: Literal["draft_with_manual_review"]
|
||||
max_questions_per_document: int = Field(default=10, gt=0)
|
||||
max_source_chunks_per_question: int = Field(default=3, gt=0)
|
||||
|
||||
|
||||
class DatasetBuildOutputModel(BaseModel):
|
||||
"""Schema for dataset build output locations."""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
dataset_path: str
|
||||
artifact_dir: str
|
||||
|
||||
|
||||
class DatasetBuildRuntimeModel(BaseModel):
|
||||
"""Schema for runtime throttling and document limits."""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
max_documents: int | None = Field(default=None, gt=0)
|
||||
|
||||
|
||||
class DatasetBuildConfigModel(BaseModel):
|
||||
"""Top-level schema for a dataset build job."""
|
||||
|
||||
model_config = ConfigDict(extra="ignore")
|
||||
|
||||
job_name: str
|
||||
input: DatasetBuildInputModel
|
||||
parser: DatasetBuildParserModel
|
||||
generation: DatasetBuildGenerationModel
|
||||
output: DatasetBuildOutputModel
|
||||
runtime: DatasetBuildRuntimeModel = Field(default_factory=DatasetBuildRuntimeModel)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_job_name(self) -> "DatasetBuildConfigModel":
|
||||
"""Reject blank job names that would break artifact paths."""
|
||||
if not self.job_name.strip():
|
||||
raise ValueError("job_name must not be empty.")
|
||||
return self
|
||||
|
||||
def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
|
||||
"""Resolve relative paths against the config file directory."""
|
||||
candidate = Path(raw_path)
|
||||
if candidate.is_absolute():
|
||||
return candidate
|
||||
return (base_dir / candidate).resolve()
|
||||
Reference in New Issue
Block a user