feat(dataset-builder): add retry logic and ASCII-safe logging for Siemens PDF pipeline

- question_generator.py: add a retry loop (max_retries=3, retry_delay=5s) around the LLM call with a linearly increasing backoff (retry_delay * attempt); any exception raised while requesting or parsing the response, including LLM timeouts and server errors, triggers a retry; encode filenames with ascii/replace before printing to avoid UnicodeEncodeError on Windows cp1252 consoles
- runner.py: encode PDF filenames ASCII-safe for progress messages; catch generation failures per-document and skip (or re-raise) based on failure_mode, preventing one bad doc from aborting the whole build

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-06-15 23:06:33 +08:00
parent 75ae7927ad
commit 1ff4a3943a
2 changed files with 60 additions and 24 deletions
--- a/rag_eval/dataset_builder/generator/question_generator.py
+++ b/rag_eval/dataset_builder/generator/question_generator.py
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import json
+import time
 from abc import ABC, abstractmethod
 from typing import Any

@@ -150,24 +151,39 @@ class OpenAIQuestionGenerator(QuestionGenerator):
        max_questions: int,
        max_chunks_per_question: int,
        job_name: str,
+        max_retries: int = 3,
+        retry_delay: float = 5.0,
    ) -> list[DraftQuestionSample]:
-        """Generate draft questions for one parsed document."""
+        """Generate draft questions for one parsed document, with retry on timeout/server errors."""
        prompt = self._build_prompt(
            document,
            max_questions=max_questions,
            max_chunks_per_question=max_chunks_per_question,
        )
-        response = self.client.chat.completions.create(
-            model=self.model,
-            messages=[
-                {"role": "system", "content": "You generate structured draft question banks from source documents."},
-                {"role": "user", "content": prompt},
-            ],
-            response_format={"type": "json_object"},
-        )
-        content = response.choices[0].message.content or "{}"
-        payload = self._parse_response_payload(content)
-        return [
-            self._build_sample(document=document, payload=item, index=index, job_name=job_name)
-            for index, item in enumerate(payload[:max_questions], start=1)
-        ]
+        last_exc: Exception | None = None
+        for attempt in range(1, max_retries + 1):
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": "You generate structured draft question banks from source documents."},
+                        {"role": "user", "content": prompt},
+                    ],
+                    response_format={"type": "json_object"},
+                )
+                content = response.choices[0].message.content or "{}"
+                payload = self._parse_response_payload(content)
+                return [
+                    self._build_sample(document=document, payload=item, index=index, job_name=job_name)
+                    for index, item in enumerate(payload[:max_questions], start=1)
+                ]
+            except Exception as exc:
+                last_exc = exc
+                if attempt < max_retries:
+                    wait = retry_delay * attempt
+                    doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
+                    print(f"  [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
+                    time.sleep(wait)
+        raise RuntimeError(
+            f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
+        ) from last_exc