2026-05-18 22:30:28 +08:00
|
|
|
"""Implement infrastructure support for Aliyun document parsing."""
|
2026-05-18 16:32:42 +08:00
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2026-05-18 22:30:28 +08:00
|
|
|
from app.config.settings import settings
|
|
|
|
|
from app.domain.documents import DocumentParser, ParsedDocument
|
|
|
|
|
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
|
|
|
|
|
from app.infrastructure.parser.aliyun_layout_normalizer import (
|
2026-05-18 16:32:42 +08:00
|
|
|
MAX_CHARS,
|
|
|
|
|
OVERLAP_CHARS,
|
|
|
|
|
build_semantic_blocks,
|
|
|
|
|
build_structure_nodes,
|
|
|
|
|
build_vector_chunks,
|
|
|
|
|
)
|
2026-05-18 22:30:28 +08:00
|
|
|
|
2026-05-18 16:32:42 +08:00
|
|
|
# Keep adapter behavior explicit so integration details remain easy to audit.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AliyunDocumentParser(DocumentParser):
    """Document parser adapter that delegates to the Aliyun Docmind gateway."""

    parser_name = "aliyun_docmind"

    def __init__(self) -> None:
        """Create the gateway instance this adapter sends documents to."""
        self.gateway = AliyunDocmindGateway()

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Parse one file through the gateway and assemble a ParsedDocument.

        The gateway result's layouts are turned into structure nodes,
        semantic blocks, and vector chunks by the layout-normalizer helpers.
        The raw text is the non-empty ``"text"`` values of the semantic
        blocks joined by blank lines.

        Args:
            file_path: Passed to the gateway's ``parse_document`` call.
            doc_id: Identifier stored on the result and passed to the
                vector-chunk builder.
            doc_name: Name stored on the result and passed to the
                vector-chunk builder as the document title.

        Returns:
            A ParsedDocument carrying the derived artifacts, the raw layouts,
            and metadata about the gateway call.
        """
        response = self.gateway.parse_document(file_path=file_path)
        layouts = response.layouts

        nodes = build_structure_nodes(layouts)
        blocks = build_semantic_blocks(layouts)
        chunks = build_vector_chunks(
            blocks,
            doc_id=doc_id,
            doc_title=doc_name,
            max_chars=MAX_CHARS,
            overlap_chars=OVERLAP_CHARS,
        )

        # Only blocks with a truthy "text" value contribute to the raw text.
        text_parts = []
        for block in blocks:
            text = block.get("text")
            if text:
                text_parts.append(text)
        joined_text = "\n\n".join(text_parts)

        # Gateway call details kept alongside the parsed output.
        call_metadata = {
            "task_id": response.task_id,
            "layout_count": len(layouts),
            "poll_attempts": response.poll_attempts,
            "duration_ms": response.duration_ms,
            "parser_backend": self.parser_name,
            "artifact_prefix": settings.document_parse_artifact_prefix,
        }

        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=nodes,
            semantic_blocks=blocks,
            vector_chunks=chunks,
            parser_name=self.parser_name,
            raw_text=joined_text,
            raw_layouts=layouts,
            metadata=call_metadata,
        )
|