first commit

This commit is contained in:
2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
"""Document parser that normalizes Alibaba layout results into internal models."""
from __future__ import annotations
from pathlib import Path
from rag_eval.dataset_builder.models import ParsedDocument
from .aliyun_docmind_gateway import AliyunDocmindGateway
from .aliyun_layout_normalizer import normalize_layouts
class AliyunDocumentParser:
"""Parse PDFs through the Alibaba gateway and normalize the returned layouts."""
def __init__(self, gateway: AliyunDocmindGateway):
"""Store the gateway dependency used for remote parsing."""
self.gateway = gateway
def parse(self, pdf_path: Path) -> ParsedDocument:
"""Parse one PDF file into a normalized parsed-document model."""
payload = self.gateway.parse_document(pdf_path)
layouts = payload.get("layouts") or []
if not layouts:
raise ValueError(f"No layouts returned for document: {pdf_path.name}")
document = normalize_layouts(
doc_id=str(payload.get("doc_id") or pdf_path.stem),
doc_name=str(payload.get("doc_name") or pdf_path.name),
layouts=list(layouts),
)
document.metadata.update(
{
"task_id": payload.get("task_id"),
"provider": "aliyun_docmind",
}
)
return document