Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -0,0 +1,55 @@
"""Implement infrastructure support for aliyun document parser."""
from __future__ import annotations
from app.aliyun_parser.parse_pdf import (
MAX_CHARS,
OVERLAP_CHARS,
build_semantic_blocks,
build_structure_nodes,
build_vector_chunks,
collect_all_results,
init_client,
submit_job,
wait_for_completion,
)
from app.domain.documents import DocumentParser, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.
class AliyunDocumentParser(DocumentParser):
    """Document parser adapter that drives the Aliyun parsing helpers.

    The class only sequences the helper functions imported from
    ``app.aliyun_parser.parse_pdf`` and packages what they return into a
    ``ParsedDocument``; it holds no state beyond ``parser_name``.
    """

    # Identifier recorded on every ParsedDocument produced by this adapter.
    parser_name = "aliyun_docmind"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Run one file through a parsing job and package the results.

        Args:
            file_path: Forwarded to ``submit_job`` to identify the file.
            doc_id: Forwarded to the chunk builder and stored on the result.
            doc_name: Used as the chunk title and stored on the result.

        Returns:
            A ``ParsedDocument`` holding the structure nodes, semantic
            blocks, vector chunks, the joined block text, and a metadata
            dict with the job's task id and the number of layouts collected.

        Raises:
            RuntimeError: If ``wait_for_completion`` returns a falsy value.
        """
        api_client = init_client()
        job_id = submit_job(api_client, file_path)

        # Results are only collected once the job reports success.
        job_succeeded = wait_for_completion(api_client, job_id)
        if not job_succeeded:
            raise RuntimeError("阿里云文档解析任务失败")

        layouts = collect_all_results(api_client, job_id)
        nodes = build_structure_nodes(layouts)
        blocks = build_semantic_blocks(layouts)
        chunks = build_vector_chunks(
            blocks,
            doc_id=doc_id,
            doc_title=doc_name,
            max_chars=MAX_CHARS,
            overlap_chars=OVERLAP_CHARS,
        )

        # Blocks with missing or empty text are left out of the raw text so
        # the join does not produce runs of blank separators.
        block_texts = []
        for block in blocks:
            text = block.get("text")
            if text:
                block_texts.append(text)
        joined_text = "\n\n".join(block_texts)

        job_metadata = {"task_id": job_id, "layout_count": len(layouts)}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=nodes,
            semantic_blocks=blocks,
            vector_chunks=chunks,
            parser_name=self.parser_name,
            raw_text=joined_text,
            metadata=job_metadata,
        )