first commit
This commit is contained in:
38
rag_eval/dataset_builder/parser/aliyun_document_parser.py
Normal file
38
rag_eval/dataset_builder/parser/aliyun_document_parser.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Document parser that normalizes Alibaba layout results into internal models."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.dataset_builder.models import ParsedDocument
|
||||
|
||||
from .aliyun_docmind_gateway import AliyunDocmindGateway
|
||||
from .aliyun_layout_normalizer import normalize_layouts
|
||||
|
||||
|
||||
class AliyunDocumentParser:
|
||||
"""Parse PDFs through the Alibaba gateway and normalize the returned layouts."""
|
||||
|
||||
def __init__(self, gateway: AliyunDocmindGateway):
|
||||
"""Store the gateway dependency used for remote parsing."""
|
||||
self.gateway = gateway
|
||||
|
||||
def parse(self, pdf_path: Path) -> ParsedDocument:
|
||||
"""Parse one PDF file into a normalized parsed-document model."""
|
||||
payload = self.gateway.parse_document(pdf_path)
|
||||
layouts = payload.get("layouts") or []
|
||||
if not layouts:
|
||||
raise ValueError(f"No layouts returned for document: {pdf_path.name}")
|
||||
|
||||
document = normalize_layouts(
|
||||
doc_id=str(payload.get("doc_id") or pdf_path.stem),
|
||||
doc_name=str(payload.get("doc_name") or pdf_path.name),
|
||||
layouts=list(layouts),
|
||||
)
|
||||
document.metadata.update(
|
||||
{
|
||||
"task_id": payload.get("task_id"),
|
||||
"provider": "aliyun_docmind",
|
||||
}
|
||||
)
|
||||
return document
|
||||
Reference in New Issue
Block a user