feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024, and updated the parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for the embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -1,19 +1,18 @@
"""Implement infrastructure support for aliyun document parser."""
"""Implement infrastructure support for Aliyun document parsing."""
from __future__ import annotations
from app.aliyun_parser.parse_pdf import (
from app.config.settings import settings
from app.domain.documents import DocumentParser, ParsedDocument
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
from app.infrastructure.parser.aliyun_layout_normalizer import (
MAX_CHARS,
OVERLAP_CHARS,
build_semantic_blocks,
build_structure_nodes,
build_vector_chunks,
collect_all_results,
init_client,
submit_job,
wait_for_completion,
)
from app.domain.documents import DocumentParser, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.
@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
"""Provide the Aliyun Document Parser parser."""
parser_name = "aliyun_docmind"
def __init__(self) -> None:
"""Initialize the parser adapter and its gateway dependency.

Creates an ``AliyunDocmindGateway`` with no arguments and stores it on
``self.gateway``; ``parse`` calls ``self.gateway.parse_document`` on it
to obtain the document layouts.
"""
self.gateway = AliyunDocmindGateway()
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
"""Handle parse for the Aliyun Document Parser instance."""
client = init_client()
task_id = submit_job(client, file_path)
if not wait_for_completion(client, task_id):
raise RuntimeError("阿里云文档解析任务失败")
layouts = collect_all_results(client, task_id)
payload = self.gateway.parse_document(file_path=file_path)
layouts = payload.layouts
structure_nodes = build_structure_nodes(layouts)
semantic_blocks = build_semantic_blocks(layouts)
vector_chunks = build_vector_chunks(
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
vector_chunks=vector_chunks,
parser_name=self.parser_name,
raw_text=raw_text,
metadata={"task_id": task_id, "layout_count": len(layouts)},
raw_layouts=layouts,
metadata={
"task_id": payload.task_id,
"layout_count": len(layouts),
"poll_attempts": payload.poll_attempts,
"duration_ms": payload.duration_ms,
"parser_backend": self.parser_name,
"artifact_prefix": settings.document_parse_artifact_prefix,
},
)