feat(dataset-builder): add retry logic and ASCII-safe logging for Siemens PDF pipeline
- question_generator.py: add a retry loop (max_retries=3 attempts, retry_delay=5s base) with linearly increasing backoff (wait = retry_delay * attempt) whenever generation raises, e.g. on LLM timeout or server errors; encode filenames with ascii/replace before printing to avoid UnicodeEncodeError on Windows cp1252 consoles
- runner.py: encode PDF filenames ASCII-safe for progress messages; catch generation failures per-document and skip (or re-raise) based on failure_mode, preventing one bad doc from aborting the whole build

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
@@ -150,24 +151,39 @@ class OpenAIQuestionGenerator(QuestionGenerator):
|
||||
max_questions: int,
|
||||
max_chunks_per_question: int,
|
||||
job_name: str,
|
||||
max_retries: int = 3,
|
||||
retry_delay: float = 5.0,
|
||||
) -> list[DraftQuestionSample]:
|
||||
"""Generate draft questions for one parsed document."""
|
||||
"""Generate draft questions for one parsed document, with retry on timeout/server errors."""
|
||||
prompt = self._build_prompt(
|
||||
document,
|
||||
max_questions=max_questions,
|
||||
max_chunks_per_question=max_chunks_per_question,
|
||||
)
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": "You generate structured draft question banks from source documents."},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
response_format={"type": "json_object"},
|
||||
)
|
||||
content = response.choices[0].message.content or "{}"
|
||||
payload = self._parse_response_payload(content)
|
||||
return [
|
||||
self._build_sample(document=document, payload=item, index=index, job_name=job_name)
|
||||
for index, item in enumerate(payload[:max_questions], start=1)
|
||||
]
|
||||
last_exc: Exception | None = None
|
||||
for attempt in range(1, max_retries + 1):
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": "You generate structured draft question banks from source documents."},
|
||||
{"role": "user", "content": prompt},
|
||||
],
|
||||
response_format={"type": "json_object"},
|
||||
)
|
||||
content = response.choices[0].message.content or "{}"
|
||||
payload = self._parse_response_payload(content)
|
||||
return [
|
||||
self._build_sample(document=document, payload=item, index=index, job_name=job_name)
|
||||
for index, item in enumerate(payload[:max_questions], start=1)
|
||||
]
|
||||
except Exception as exc:
|
||||
last_exc = exc
|
||||
if attempt < max_retries:
|
||||
wait = retry_delay * attempt
|
||||
doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
|
||||
print(f" [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
|
||||
time.sleep(wait)
|
||||
raise RuntimeError(
|
||||
f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
|
||||
) from last_exc
|
||||
|
||||
Reference in New Issue
Block a user