Files
siemens_ragas/rag_eval/dataset_builder/parser/aliyun_document_parser.py

39 lines
1.3 KiB
Python
Raw Normal View History

2026-06-12 14:02:15 +08:00
"""Document parser that normalizes Alibaba layout results into internal models."""
from __future__ import annotations
from pathlib import Path
from rag_eval.dataset_builder.models import ParsedDocument
from .aliyun_docmind_gateway import AliyunDocmindGateway
from .aliyun_layout_normalizer import normalize_layouts
class AliyunDocumentParser:
    """Parse PDFs through the Alibaba gateway and normalize the returned layouts.

    The parser does not talk to the remote service itself: it delegates to
    the injected gateway's ``parse_document`` and hands the returned layouts
    to ``normalize_layouts``.
    """

    def __init__(self, gateway: AliyunDocmindGateway):
        """Store the gateway dependency used for remote parsing."""
        self.gateway = gateway

    def parse(self, pdf_path: Path | str) -> ParsedDocument:
        """Parse one PDF file into a normalized parsed-document model.

        Args:
            pdf_path: Location of the PDF. It is passed to the gateway
                unchanged; its file name and stem are used as fallbacks when
                the payload carries no ``doc_name`` / ``doc_id``.

        Returns:
            The object produced by ``normalize_layouts``, with ``task_id``
            and ``provider`` merged into its ``metadata``.

        Raises:
            ValueError: If the gateway returns no payload, or a payload whose
                ``layouts`` entry is missing or empty.
        """
        # Normalize locally so plain string paths also work for the
        # name/stem fallbacks; the gateway still receives the caller's object.
        source_path = Path(pdf_path)

        # A missing payload is treated like an empty one so callers get the
        # ValueError below rather than an AttributeError on None.
        payload = self.gateway.parse_document(pdf_path) or {}

        layouts = payload.get("layouts") or []
        if not layouts:
            raise ValueError(f"No layouts returned for document: {source_path.name}")

        document = normalize_layouts(
            doc_id=str(payload.get("doc_id") or source_path.stem),
            doc_name=str(payload.get("doc_name") or source_path.name),
            # Pass a copy so normalization cannot mutate the gateway payload.
            layouts=list(layouts),
        )
        document.metadata.update(
            {
                "task_id": payload.get("task_id"),
                "provider": "aliyun_docmind",
            }
        )
        return document