39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
|
|
"""Document parser that normalizes Alibaba layout results into internal models."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from rag_eval.dataset_builder.models import ParsedDocument
|
||
|
|
|
||
|
|
from .aliyun_docmind_gateway import AliyunDocmindGateway
|
||
|
|
from .aliyun_layout_normalizer import normalize_layouts
|
||
|
|
|
||
|
|
|
||
|
|
class AliyunDocumentParser:
|
||
|
|
"""Parse PDFs through the Alibaba gateway and normalize the returned layouts."""
|
||
|
|
|
||
|
|
def __init__(self, gateway: AliyunDocmindGateway):
|
||
|
|
"""Store the gateway dependency used for remote parsing."""
|
||
|
|
self.gateway = gateway
|
||
|
|
|
||
|
|
def parse(self, pdf_path: Path) -> ParsedDocument:
|
||
|
|
"""Parse one PDF file into a normalized parsed-document model."""
|
||
|
|
payload = self.gateway.parse_document(pdf_path)
|
||
|
|
layouts = payload.get("layouts") or []
|
||
|
|
if not layouts:
|
||
|
|
raise ValueError(f"No layouts returned for document: {pdf_path.name}")
|
||
|
|
|
||
|
|
document = normalize_layouts(
|
||
|
|
doc_id=str(payload.get("doc_id") or pdf_path.stem),
|
||
|
|
doc_name=str(payload.get("doc_name") or pdf_path.name),
|
||
|
|
layouts=list(layouts),
|
||
|
|
)
|
||
|
|
document.metadata.update(
|
||
|
|
{
|
||
|
|
"task_id": payload.get("task_id"),
|
||
|
|
"provider": "aliyun_docmind",
|
||
|
|
}
|
||
|
|
)
|
||
|
|
return document
|