feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
@@ -1,19 +1,18 @@
|
||||
"""Implement infrastructure support for aliyun document parser."""
|
||||
"""Implement infrastructure support for Aliyun document parsing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.aliyun_parser.parse_pdf import (
|
||||
from app.config.settings import settings
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
from app.infrastructure.parser.aliyun_docmind_gateway import AliyunDocmindGateway
|
||||
from app.infrastructure.parser.aliyun_layout_normalizer import (
|
||||
MAX_CHARS,
|
||||
OVERLAP_CHARS,
|
||||
build_semantic_blocks,
|
||||
build_structure_nodes,
|
||||
build_vector_chunks,
|
||||
collect_all_results,
|
||||
init_client,
|
||||
submit_job,
|
||||
wait_for_completion,
|
||||
)
|
||||
from app.domain.documents import DocumentParser, ParsedDocument
|
||||
|
||||
# Keep adapter behavior explicit so integration details remain easy to audit.
|
||||
|
||||
|
||||
@@ -22,13 +21,14 @@ class AliyunDocumentParser(DocumentParser):
|
||||
"""Provide the Aliyun Document Parser parser."""
|
||||
parser_name = "aliyun_docmind"
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the parser adapter and its gateway dependency."""
|
||||
self.gateway = AliyunDocmindGateway()
|
||||
|
||||
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
|
||||
"""Handle parse for the Aliyun Document Parser instance."""
|
||||
client = init_client()
|
||||
task_id = submit_job(client, file_path)
|
||||
if not wait_for_completion(client, task_id):
|
||||
raise RuntimeError("阿里云文档解析任务失败")
|
||||
layouts = collect_all_results(client, task_id)
|
||||
payload = self.gateway.parse_document(file_path=file_path)
|
||||
layouts = payload.layouts
|
||||
structure_nodes = build_structure_nodes(layouts)
|
||||
semantic_blocks = build_semantic_blocks(layouts)
|
||||
vector_chunks = build_vector_chunks(
|
||||
@@ -51,5 +51,13 @@ class AliyunDocumentParser(DocumentParser):
|
||||
vector_chunks=vector_chunks,
|
||||
parser_name=self.parser_name,
|
||||
raw_text=raw_text,
|
||||
metadata={"task_id": task_id, "layout_count": len(layouts)},
|
||||
raw_layouts=layouts,
|
||||
metadata={
|
||||
"task_id": payload.task_id,
|
||||
"layout_count": len(layouts),
|
||||
"poll_attempts": payload.poll_attempts,
|
||||
"duration_ms": payload.duration_ms,
|
||||
"parser_backend": self.parser_name,
|
||||
"artifact_prefix": settings.document_parse_artifact_prefix,
|
||||
},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user