"""Implement infrastructure support for Aliyun document parsing.""" from __future__ import annotations from app.config.settings import settings from app.domain.documents import DocumentParser, ParsedDocument from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway from app.infrastructure.parser.aliyun_layout_normalizer import ( MAX_CHARS, OVERLAP_CHARS, build_semantic_blocks, build_structure_nodes, build_vector_chunks, ) # Keep adapter behavior explicit so integration details remain easy to audit. class AliyunDocumentParser(DocumentParser): """Provide the Aliyun Document Parser parser.""" parser_name = "aliyun_docmind" def __init__(self) -> None: """Initialize the parser adapter and its gateway dependency.""" self.gateway = AliyunDocmindGateway() def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument: """Handle parse for the Aliyun Document Parser instance.""" payload = self.gateway.parse_document(file_path=file_path) layouts = payload.layouts structure_nodes = build_structure_nodes(layouts) semantic_blocks = build_semantic_blocks(layouts) vector_chunks = build_vector_chunks( semantic_blocks, doc_id=doc_id, doc_title=doc_name, max_chars=MAX_CHARS, overlap_chars=OVERLAP_CHARS, ) raw_text = "\n\n".join( block.get("text", "") for block in semantic_blocks if block.get("text") ) return ParsedDocument( doc_id=doc_id, doc_name=doc_name, structure_nodes=structure_nodes, semantic_blocks=semantic_blocks, vector_chunks=vector_chunks, parser_name=self.parser_name, raw_text=raw_text, raw_layouts=layouts, metadata={ "task_id": payload.task_id, "layout_count": len(layouts), "poll_attempts": payload.poll_attempts, "duration_ms": payload.duration_ms, "parser_backend": self.parser_name, "artifact_prefix": settings.document_parse_artifact_prefix, }, )