Files
AIRegulation-DocAnalysis/backend/app/infrastructure/parser/aliyun_document_parser.py

64 lines
2.2 KiB
Python
Raw Normal View History

"""Implement infrastructure support for Aliyun document parsing."""
from __future__ import annotations
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
from app.infrastructure.parser.aliyun_layout_normalizer import (
MAX_CHARS,
OVERLAP_CHARS,
build_semantic_blocks,
build_structure_nodes,
build_vector_chunks,
)
# Keep adapter behavior explicit so integration details remain easy to audit.
class AliyunDocumentParser(DocumentParser):
    """Document parser adapter backed by ``AliyunDocmindGateway``.

    The adapter asks the gateway to parse a file, feeds the returned layouts
    through the layout-normalizer helpers, and packages everything into a
    ``ParsedDocument``.
    """

    # Identifier reported both as ``parser_name`` and as the
    # ``parser_backend`` metadata entry of every parsed document.
    parser_name = "aliyun_docmind"

    def __init__(self) -> None:
        """Create the adapter with its own ``AliyunDocmindGateway`` instance."""
        self.gateway = AliyunDocmindGateway()

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Parse one file and return its normalized representation.

        Args:
            file_path: Location of the file, forwarded to the gateway.
            doc_id: Document identifier, stored on the result and passed to
                the vector-chunk builder.
            doc_name: Document name, stored on the result and used as the
                chunk title.

        Returns:
            A ``ParsedDocument`` carrying the structure nodes, semantic
            blocks, vector chunks, joined raw text, the raw layouts, and a
            metadata dict describing the gateway call.
        """
        gateway_result = self.gateway.parse_document(file_path=file_path)
        layouts = gateway_result.layouts

        # Derive the normalized views from the raw layouts; the chunker works
        # on the semantic blocks rather than on the layouts directly.
        nodes = build_structure_nodes(layouts)
        blocks = build_semantic_blocks(layouts)
        chunks = build_vector_chunks(
            blocks,
            doc_id=doc_id,
            doc_title=doc_name,
            max_chars=MAX_CHARS,
            overlap_chars=OVERLAP_CHARS,
        )

        # Collect only blocks that carry non-empty text so the joined output
        # contains no blank paragraphs.
        text_parts = []
        for block in blocks:
            text = block.get("text")
            if text:
                text_parts.append(text)
        joined_text = "\n\n".join(text_parts)

        # Bookkeeping copied from the gateway payload plus parser settings.
        parse_metadata = {
            "task_id": gateway_result.task_id,
            "layout_count": len(layouts),
            "poll_attempts": gateway_result.poll_attempts,
            "duration_ms": gateway_result.duration_ms,
            "parser_backend": self.parser_name,
            "artifact_prefix": settings.document_parse_artifact_prefix,
        }

        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=nodes,
            semantic_blocks=blocks,
            vector_chunks=chunks,
            parser_name=self.parser_name,
            raw_text=joined_text,
            raw_layouts=layouts,
            metadata=parse_metadata,
        )