first commit
This commit is contained in:
142
rag_eval/dataset_builder/runner.py
Normal file
142
rag_eval/dataset_builder/runner.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""Orchestration layer for PDF-to-dataset build jobs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from rag_eval.settings import EvaluationSettings
|
||||
from rag_eval.shared.utils import ensure_directory, utc_now_iso
|
||||
|
||||
from .generator.question_generator import OpenAIQuestionGenerator, QuestionGenerator
|
||||
from .generator.validators import dedupe_samples, validate_draft_sample
|
||||
from .models import DatasetBuildJob, DatasetBuildResult, DatasetBuildRuntime, ParseFailure
|
||||
from .parser.aliyun_document_parser import AliyunDocumentParser
|
||||
from .parser.aliyun_docmind_gateway import AliyunDocmindGateway
|
||||
from .schema import DatasetBuildConfigModel
|
||||
from .sources import discover_pdf_files
|
||||
from .writers import build_artifact_paths, write_dataset_build_artifacts
|
||||
|
||||
|
||||
def load_dataset_build_job(path: str | Path, settings: EvaluationSettings | None = None) -> DatasetBuildJob:
    """Read a dataset build YAML file and convert it into a ``DatasetBuildJob``.

    The file is parsed with ``yaml.safe_load`` and validated through
    ``DatasetBuildConfigModel``. The input path, dataset path and artifact
    directory are passed through ``model.resolve_path`` together with the
    directory that contains the config file.

    Args:
        path: Location of the YAML job definition.
        settings: Source of fallback values; a fresh ``EvaluationSettings()``
            is created when none (or a falsy value) is supplied.

    Returns:
        The assembled job, with ``source_path`` set to the resolved config path.
    """
    if not settings:
        settings = EvaluationSettings()

    source = Path(path).resolve()
    raw = yaml.safe_load(source.read_text(encoding="utf-8"))
    if not raw:
        # Any falsy parse result (e.g. an empty document) becomes an empty mapping.
        raw = {}
    config = DatasetBuildConfigModel.model_validate(raw)
    root = source.parent

    # Generation model precedence: job file, then settings, then built-in default.
    chosen_model = config.generation.model
    if not chosen_model:
        chosen_model = settings.dataset_generator_model
    if not chosen_model:
        chosen_model = "qwen3.6-plus"

    # The failure mode is read from the raw payload rather than the validated
    # model. NOTE(review): presumably so an unset value can fall through to
    # the settings default -- confirm against the schema.
    raw_parser = raw.get("parser")
    on_failure = raw_parser.get("failure_mode") if raw_parser else None
    if not on_failure:
        on_failure = settings.parser_failure_mode
    if not on_failure:
        on_failure = "fail"

    generation = config.generation
    return DatasetBuildJob(
        job_name=config.job_name,
        input_path=config.resolve_path(root, config.input.path),
        input_glob=config.input.glob,
        parser_provider=config.parser.provider,
        failure_mode=on_failure,
        generation_model=chosen_model,
        output_type=generation.output_type,
        review_mode=generation.review_mode,
        max_questions_per_document=generation.max_questions_per_document,
        max_source_chunks_per_question=generation.max_source_chunks_per_question,
        dataset_path=config.resolve_path(root, config.output.dataset_path),
        artifact_dir=config.resolve_path(root, config.output.artifact_dir),
        runtime=DatasetBuildRuntime(max_documents=config.runtime.max_documents),
        source_path=source,
    )
|
||||
|
||||
|
||||
def _create_parser(job: DatasetBuildJob, settings: EvaluationSettings) -> AliyunDocumentParser:
    """Build the document parser selected by ``job.parser_provider``.

    Only the ``"aliyun_docmind"`` provider is handled here; it yields an
    ``AliyunDocumentParser`` wrapping an ``AliyunDocmindGateway`` constructed
    from ``settings``.

    Raises:
        ValueError: If the job names any other provider.
    """
    provider = job.parser_provider
    if provider == "aliyun_docmind":
        return AliyunDocumentParser(AliyunDocmindGateway(settings))
    raise ValueError(f"Unsupported parser provider: {provider}")
|
||||
|
||||
|
||||
def _create_generator(job: DatasetBuildJob, settings: EvaluationSettings) -> QuestionGenerator:
    """Build the draft question generator for ``job``.

    Returns an ``OpenAIQuestionGenerator`` constructed with the given
    ``settings`` and the job's ``generation_model``.
    """
    model_name = job.generation_model
    question_generator = OpenAIQuestionGenerator(settings=settings, model=model_name)
    return question_generator
|
||||
|
||||
|
||||
def _collect_valid_samples(generator: QuestionGenerator, document, job: DatasetBuildJob) -> list:
    """Generate draft samples for one document and keep the valid, de-duplicated ones.

    Samples for which ``validate_draft_sample`` reports any error are dropped.
    The survivors are passed through ``dedupe_samples`` and truncated to
    ``job.max_questions_per_document``.
    """
    generated = generator.generate(
        document,
        max_questions=job.max_questions_per_document,
        max_chunks_per_question=job.max_source_chunks_per_question,
        job_name=job.job_name,
    )
    valid = [
        sample
        for sample in generated
        if not validate_draft_sample(
            sample,
            document=document,
            max_source_chunks_per_question=job.max_source_chunks_per_question,
        )
    ]
    return dedupe_samples(valid)[: job.max_questions_per_document]


def run_dataset_build(
    config_path: str | Path,
    *,
    settings: EvaluationSettings | None = None,
    parser: AliyunDocumentParser | None = None,
    generator: QuestionGenerator | None = None,
) -> DatasetBuildResult:
    """Run one dataset build job end to end and persist all required artifacts.

    Args:
        config_path: Path to the YAML job definition.
        settings: Evaluation settings; a fresh ``EvaluationSettings()`` is
            created when ``None``.
        parser: Document parser to use; created from the job when ``None``.
        generator: Question generator to use; created from the job when ``None``.

    Returns:
        The build result, after it has been passed to
        ``write_dataset_build_artifacts``.

    Raises:
        Exception: Whatever ``parser.parse`` raised, re-raised unchanged when
            the job's ``failure_mode`` is ``"fail"``. The partial result is
            written before the error propagates.
    """
    # Compare against None explicitly so an injected collaborator that happens
    # to be falsy (e.g. a test double defining __len__) is not silently replaced
    # by the real implementation.
    if settings is None:
        settings = EvaluationSettings()

    job = load_dataset_build_job(config_path, settings=settings)
    pdf_files = discover_pdf_files(job.input_path, job.input_glob)
    if job.runtime.max_documents is not None:
        pdf_files = pdf_files[: job.runtime.max_documents]

    if parser is None:
        parser = _create_parser(job, settings)
    if generator is None:
        generator = _create_generator(job, settings)

    # Colons are replaced in the timestamp before it is used as a directory name.
    run_id = utc_now_iso().replace(":", "-")
    artifact_root = job.artifact_dir / run_id
    ensure_directory(artifact_root)
    artifact_paths = build_artifact_paths(artifact_root)

    documents = []
    failures: list[ParseFailure] = []
    draft_samples = []

    def current_result() -> DatasetBuildResult:
        """Bundle the state accumulated so far into a result object."""
        return DatasetBuildResult(
            job=job,
            run_id=run_id,
            artifact_paths=artifact_paths,
            documents=documents,
            draft_samples=draft_samples,
            parse_failures=failures,
        )

    for pdf_path in pdf_files:
        try:
            document = parser.parse(pdf_path)
        except Exception as exc:
            failures.append(ParseFailure(file_path=pdf_path.as_posix(), error=str(exc)))
            if job.failure_mode == "fail":
                # Persist what was collected so far, then let the error propagate.
                write_dataset_build_artifacts(current_result())
                raise
            # Any other failure mode: record the failure and move on.
            continue

        documents.append(document)
        draft_samples.extend(_collect_valid_samples(generator, document, job))

    result = current_result()
    write_dataset_build_artifacts(result)
    return result
|
||||
Reference in New Issue
Block a user