first commit
This commit is contained in:
21
rag_eval/dataset_builder/sources.py
Normal file
21
rag_eval/dataset_builder/sources.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Input source discovery helpers for dataset build jobs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def discover_pdf_files(input_path: Path, pattern: str = "*.pdf") -> list[Path]:
|
||||
"""Return all PDF files from a single file path or a directory scan."""
|
||||
if not input_path.exists():
|
||||
raise FileNotFoundError(f"Input path does not exist: {input_path}")
|
||||
|
||||
if input_path.is_file():
|
||||
if input_path.suffix.lower() != ".pdf":
|
||||
raise ValueError(f"Input file is not a PDF: {input_path}")
|
||||
return [input_path]
|
||||
|
||||
files = sorted(path for path in input_path.glob(pattern) if path.is_file() and path.suffix.lower() == ".pdf")
|
||||
if not files:
|
||||
raise ValueError(f"No PDF files found under {input_path} with pattern {pattern}")
|
||||
return files
|
||||
Reference in New Issue
Block a user