feat(dataset-builder): add retry logic and ASCII-safe logging for Siemens PDF pipeline

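- question_generator.py: add a retry loop (max_retries=3, retry_delay=5s)
  with linear backoff (wait = retry_delay * attempt) on any exception
  raised by the LLM call, response parsing, or sample building; raise
  RuntimeError once all attempts fail; encode filenames with
  ascii/replace before printing to avoid UnicodeEncodeError on
  Windows cp1252 consoles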
- runner.py: encode PDF filenames ASCII-safe for progress messages;
  catch generation failures per-document and record each one as a
  ParseFailure; when failure_mode == "fail", write the build artifacts
  and re-raise, otherwise skip the document, preventing one bad doc
  from aborting the whole build

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 23:06:33 +08:00
parent 75ae7927ad
commit 1ff4a3943a
2 changed files with 60 additions and 24 deletions

View File

@@ -3,6 +3,7 @@
from __future__ import annotations from __future__ import annotations
import json import json
import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any from typing import Any
@@ -150,24 +151,39 @@ class OpenAIQuestionGenerator(QuestionGenerator):
max_questions: int, max_questions: int,
max_chunks_per_question: int, max_chunks_per_question: int,
job_name: str, job_name: str,
max_retries: int = 3,
retry_delay: float = 5.0,
) -> list[DraftQuestionSample]: ) -> list[DraftQuestionSample]:
"""Generate draft questions for one parsed document.""" """Generate draft questions for one parsed document, with retry on timeout/server errors."""
prompt = self._build_prompt( prompt = self._build_prompt(
document, document,
max_questions=max_questions, max_questions=max_questions,
max_chunks_per_question=max_chunks_per_question, max_chunks_per_question=max_chunks_per_question,
) )
response = self.client.chat.completions.create( last_exc: Exception | None = None
model=self.model, for attempt in range(1, max_retries + 1):
messages=[ try:
{"role": "system", "content": "You generate structured draft question banks from source documents."}, response = self.client.chat.completions.create(
{"role": "user", "content": prompt}, model=self.model,
], messages=[
response_format={"type": "json_object"}, {"role": "system", "content": "You generate structured draft question banks from source documents."},
) {"role": "user", "content": prompt},
content = response.choices[0].message.content or "{}" ],
payload = self._parse_response_payload(content) response_format={"type": "json_object"},
return [ )
self._build_sample(document=document, payload=item, index=index, job_name=job_name) content = response.choices[0].message.content or "{}"
for index, item in enumerate(payload[:max_questions], start=1) payload = self._parse_response_payload(content)
] return [
self._build_sample(document=document, payload=item, index=index, job_name=job_name)
for index, item in enumerate(payload[:max_questions], start=1)
]
except Exception as exc:
last_exc = exc
if attempt < max_retries:
wait = retry_delay * attempt
doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
print(f" [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
time.sleep(wait)
raise RuntimeError(
f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
) from last_exc

View File

@@ -111,12 +111,32 @@ def run_dataset_build(
continue continue
documents.append(document) documents.append(document)
generated = generator.generate( doc_name_safe = pdf_path.name.encode("ascii", "replace").decode("ascii")
document, print(f" [info] generating questions for: {doc_name_safe}")
max_questions=job.max_questions_per_document, try:
max_chunks_per_question=job.max_source_chunks_per_question, generated = generator.generate(
job_name=job.job_name, document,
) max_questions=job.max_questions_per_document,
max_chunks_per_question=job.max_source_chunks_per_question,
job_name=job.job_name,
)
except Exception as exc:
gen_failure = ParseFailure(file_path=pdf_path.as_posix(), error=f"generation failed: {exc}")
failures.append(gen_failure)
print(f" [warn] skipping {doc_name_safe} after generation failure: {exc}")
if job.failure_mode == "fail":
result = DatasetBuildResult(
job=job,
run_id=run_id,
artifact_paths=artifact_paths,
documents=documents,
draft_samples=draft_samples,
parse_failures=failures,
)
write_dataset_build_artifacts(result)
raise
continue
valid_generated = [] valid_generated = []
for sample in generated: for sample in generated:
errors = validate_draft_sample( errors = validate_draft_sample(
@@ -126,9 +146,9 @@ def run_dataset_build(
) )
if not errors: if not errors:
valid_generated.append(sample) valid_generated.append(sample)
draft_samples.extend( new_samples = dedupe_samples(valid_generated)[: job.max_questions_per_document]
dedupe_samples(valid_generated)[: job.max_questions_per_document] draft_samples.extend(new_samples)
) print(f" [info] {doc_name_safe}: {len(new_samples)} questions generated (total so far: {len(draft_samples)})")
result = DatasetBuildResult( result = DatasetBuildResult(
job=job, job=job,