feat(dataset-builder): add retry logic and ASCII-safe logging for Siemens PDF pipeline

- question_generator.py: add a retry loop (max_retries=3, retry_delay=5s) with linearly increasing backoff (wait = retry_delay * attempt) around the LLM call; any exception (e.g. an LLM timeout or server error) triggers a retry, and a RuntimeError is raised once all attempts fail. Encode filenames with ascii/replace before printing to avoid UnicodeEncodeError on Windows cp1252 consoles.
- runner.py: encode PDF filenames ASCII-safe for progress messages; catch generation failures per-document and skip (or re-raise) based on failure_mode, preventing one bad doc from aborting the whole build.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-06-15 23:06:33 +08:00
parent 75ae7927ad
commit 1ff4a3943a
2 changed files with 60 additions and 24 deletions
--- a/rag_eval/dataset_builder/generator/question_generator.py
+++ b/rag_eval/dataset_builder/generator/question_generator.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 import json
 import time
 from abc import ABC, abstractmethod
 from typing import Any
@@ -150,13 +151,18 @@ class OpenAIQuestionGenerator(QuestionGenerator):
        max_questions: int,
        max_chunks_per_question: int,
        job_name: str,
        max_retries: int = 3,
        retry_delay: float = 5.0,
    ) -> list[DraftQuestionSample]:
-        """Generate draft questions for one parsed document."""
+        """Generate draft questions for one parsed document, with retry on timeout/server errors."""
        prompt = self._build_prompt(
            document,
            max_questions=max_questions,
            max_chunks_per_question=max_chunks_per_question,
        )
        last_exc: Exception | None = None
        for attempt in range(1, max_retries + 1):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
@@ -171,3 +177,13 @@ class OpenAIQuestionGenerator(QuestionGenerator):
                    self._build_sample(document=document, payload=item, index=index, job_name=job_name)
                    for index, item in enumerate(payload[:max_questions], start=1)
                ]
            except Exception as exc:
                last_exc = exc
                if attempt < max_retries:
                    wait = retry_delay * attempt
                    doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
                    print(f"  [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
                    time.sleep(wait)
        raise RuntimeError(
            f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
        ) from last_exc
--- a/rag_eval/dataset_builder/runner.py
+++ b/rag_eval/dataset_builder/runner.py
@@ -111,12 +111,32 @@ def run_dataset_build(
            continue
        documents.append(document)
        doc_name_safe = pdf_path.name.encode("ascii", "replace").decode("ascii")
        print(f"  [info] generating questions for: {doc_name_safe}")
        try:
            generated = generator.generate(
                document,
                max_questions=job.max_questions_per_document,
                max_chunks_per_question=job.max_source_chunks_per_question,
                job_name=job.job_name,
            )
        except Exception as exc:
            gen_failure = ParseFailure(file_path=pdf_path.as_posix(), error=f"generation failed: {exc}")
            failures.append(gen_failure)
            print(f"  [warn] skipping {doc_name_safe} after generation failure: {exc}")
            if job.failure_mode == "fail":
                result = DatasetBuildResult(
                    job=job,
                    run_id=run_id,
                    artifact_paths=artifact_paths,
                    documents=documents,
                    draft_samples=draft_samples,
                    parse_failures=failures,
                )
                write_dataset_build_artifacts(result)
                raise
            continue
        valid_generated = []
        for sample in generated:
            errors = validate_draft_sample(
@@ -126,9 +146,9 @@ def run_dataset_build(
            )
            if not errors:
                valid_generated.append(sample)
-        draft_samples.extend(
+        new_samples = dedupe_samples(valid_generated)[: job.max_questions_per_document]
-            dedupe_samples(valid_generated)[: job.max_questions_per_document]
+        draft_samples.extend(new_samples)
-        )
+        print(f"  [info] {doc_name_safe}: {len(new_samples)} questions generated (total so far: {len(draft_samples)})")
    result = DatasetBuildResult(
        job=job,