feat(dataset-builder): add retry logic and ASCII-safe logging for Siemens PDF pipeline
- question_generator.py: add max_retries=3/retry_delay=5s loop with exponential backoff on LLM timeout or server errors; encode filenames with ascii/replace before printing to avoid UnicodeEncodeError on Windows cp1252 consoles
- runner.py: encode PDF filenames ASCII-safe for progress messages; catch generation failures per-document and skip (or re-raise) based on failure_mode, preventing one bad doc from aborting the whole build

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
@@ -111,12 +111,32 @@ def run_dataset_build(
|
||||
continue
|
||||
|
||||
documents.append(document)
|
||||
generated = generator.generate(
|
||||
document,
|
||||
max_questions=job.max_questions_per_document,
|
||||
max_chunks_per_question=job.max_source_chunks_per_question,
|
||||
job_name=job.job_name,
|
||||
)
|
||||
doc_name_safe = pdf_path.name.encode("ascii", "replace").decode("ascii")
|
||||
print(f" [info] generating questions for: {doc_name_safe}")
|
||||
try:
|
||||
generated = generator.generate(
|
||||
document,
|
||||
max_questions=job.max_questions_per_document,
|
||||
max_chunks_per_question=job.max_source_chunks_per_question,
|
||||
job_name=job.job_name,
|
||||
)
|
||||
except Exception as exc:
|
||||
gen_failure = ParseFailure(file_path=pdf_path.as_posix(), error=f"generation failed: {exc}")
|
||||
failures.append(gen_failure)
|
||||
print(f" [warn] skipping {doc_name_safe} after generation failure: {exc}")
|
||||
if job.failure_mode == "fail":
|
||||
result = DatasetBuildResult(
|
||||
job=job,
|
||||
run_id=run_id,
|
||||
artifact_paths=artifact_paths,
|
||||
documents=documents,
|
||||
draft_samples=draft_samples,
|
||||
parse_failures=failures,
|
||||
)
|
||||
write_dataset_build_artifacts(result)
|
||||
raise
|
||||
continue
|
||||
|
||||
valid_generated = []
|
||||
for sample in generated:
|
||||
errors = validate_draft_sample(
|
||||
@@ -126,9 +146,9 @@ def run_dataset_build(
|
||||
)
|
||||
if not errors:
|
||||
valid_generated.append(sample)
|
||||
draft_samples.extend(
|
||||
dedupe_samples(valid_generated)[: job.max_questions_per_document]
|
||||
)
|
||||
new_samples = dedupe_samples(valid_generated)[: job.max_questions_per_document]
|
||||
draft_samples.extend(new_samples)
|
||||
print(f" [info] {doc_name_safe}: {len(new_samples)} questions generated (total so far: {len(draft_samples)})")
|
||||
|
||||
result = DatasetBuildResult(
|
||||
job=job,
|
||||
|
||||
Reference in New Issue
Block a user