"""Infrastructure adapter that exposes the Aliyun parsing pipeline as a DocumentParser."""

from __future__ import annotations

from app.aliyun_parser.parse_pdf import (
    MAX_CHARS,
    OVERLAP_CHARS,
    build_semantic_blocks,
    build_structure_nodes,
    build_vector_chunks,
    collect_all_results,
    init_client,
    submit_job,
    wait_for_completion,
)
from app.domain.documents import DocumentParser, ParsedDocument


# The adapter spells out each pipeline step so the integration stays easy to audit.
class AliyunDocumentParser(DocumentParser):
    """DocumentParser implementation backed by the helpers in ``app.aliyun_parser.parse_pdf``."""

    # Identifier recorded on every ParsedDocument produced by this parser.
    parser_name = "aliyun_docmind"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Run the Aliyun parsing job for one file and package the results.

        Steps, in order: create a client, submit ``file_path`` as a job, wait
        for the job to finish, collect the layouts, then derive structure
        nodes, semantic blocks and vector chunks from them.

        Args:
            file_path: Path handed to ``submit_job``.
            doc_id: Document identifier; forwarded to the chunk builder and
                stored on the returned document.
            doc_name: Document name; used as the chunk ``doc_title`` and
                stored on the returned document.

        Returns:
            A ``ParsedDocument`` carrying the derived structures, the joined
            raw text, and metadata with the job's ``task_id`` and the number
            of collected layouts.

        Raises:
            RuntimeError: If ``wait_for_completion`` returns a falsy value.
        """
        aliyun_client = init_client()
        job_id = submit_job(aliyun_client, file_path)

        finished = wait_for_completion(aliyun_client, job_id)
        if not finished:
            raise RuntimeError("阿里云文档解析任务失败")

        page_layouts = collect_all_results(aliyun_client, job_id)

        nodes = build_structure_nodes(page_layouts)
        blocks = build_semantic_blocks(page_layouts)
        chunks = build_vector_chunks(
            blocks,
            doc_id=doc_id,
            doc_title=doc_name,
            max_chars=MAX_CHARS,
            overlap_chars=OVERLAP_CHARS,
        )

        # Only blocks with non-empty text contribute to the raw text; they are
        # separated by a blank line.
        text_parts = []
        for block in blocks:
            block_text = block.get("text")
            if block_text:
                text_parts.append(block_text)
        joined_text = "\n\n".join(text_parts)

        job_metadata = {"task_id": job_id, "layout_count": len(page_layouts)}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=nodes,
            semantic_blocks=blocks,
            vector_chunks=chunks,
            parser_name=self.parser_name,
            raw_text=joined_text,
            metadata=job_metadata,
        )