Fix SSE route dependency and align architecture docs
This commit is contained in:
55
backend/app/infrastructure/parser/aliyun_document_parser.py
Normal file
55
backend/app/infrastructure/parser/aliyun_document_parser.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Implement infrastructure support for aliyun document parser."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.aliyun_parser.parse_pdf import (
|
||||
MAX_CHARS,
|
||||
OVERLAP_CHARS,
|
||||
build_semantic_blocks,
|
||||
build_structure_nodes,
|
||||
build_vector_chunks,
|
||||
collect_all_results,
|
||||
init_client,
|
||||
submit_job,
|
||||
wait_for_completion,
|
||||
)
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
|
||||
class AliyunDocumentParser(DocumentParser):
    """Adapter that drives the ``app.aliyun_parser.parse_pdf`` helpers.

    The class exposes those helpers through the ``DocumentParser`` interface
    and tags every result with ``parser_name``.
    """

    # Identifier copied into each ParsedDocument this adapter returns.
    parser_name = "aliyun_docmind"

    def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
        """Run one parsing job for ``file_path`` and package the results.

        Args:
            file_path: Passed unchanged to ``submit_job``.
            doc_id: Forwarded to ``build_vector_chunks`` and stored on the result.
            doc_name: Used as ``doc_title`` for ``build_vector_chunks`` and
                stored on the result as ``doc_name``.

        Returns:
            A ``ParsedDocument`` carrying the structure nodes, semantic blocks,
            vector chunks, the joined block text, and metadata holding the
            task id and the number of collected layouts.

        Raises:
            RuntimeError: If ``wait_for_completion`` returns a falsy value.
        """
        client = init_client()
        task_id = submit_job(client, file_path)

        succeeded = wait_for_completion(client, task_id)
        if not succeeded:
            # Message text (Chinese): "Aliyun document parsing task failed".
            raise RuntimeError("阿里云文档解析任务失败")

        layouts = collect_all_results(client, task_id)
        structure_nodes = build_structure_nodes(layouts)
        semantic_blocks = build_semantic_blocks(layouts)
        vector_chunks = build_vector_chunks(
            semantic_blocks,
            doc_id=doc_id,
            doc_title=doc_name,
            max_chars=MAX_CHARS,
            overlap_chars=OVERLAP_CHARS,
        )

        # Only blocks with a truthy "text" entry contribute to the raw text;
        # entries are separated by a blank line.
        texts = []
        for block in semantic_blocks:
            text = block.get("text")
            if text:
                texts.append(text)
        raw_text = "\n\n".join(texts)

        metadata = {"task_id": task_id, "layout_count": len(layouts)}
        return ParsedDocument(
            doc_id=doc_id,
            doc_name=doc_name,
            structure_nodes=structure_nodes,
            semantic_blocks=semantic_blocks,
            vector_chunks=vector_chunks,
            parser_name=self.parser_name,
            raw_text=raw_text,
            metadata=metadata,
        )
|
||||
Reference in New Issue
Block a user