feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -0,0 +1,142 @@
"""Aliyun Docmind gateway helpers for the document ingest pipeline."""
from __future__ import annotations
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from app.config.settings import settings
# Keep provider-specific behavior isolated so the rest of the backend can stay stable.
@dataclass
class AliyunParsePayload:
"""Represent the raw Aliyun parse payload returned by the gateway."""
task_id: str
layouts: list[dict[str, Any]]
poll_attempts: int
duration_ms: int
class AliyunDocmindGateway:
"""Submit, poll, and collect results from the Aliyun Docmind API."""
def __init__(self) -> None:
"""Initialize the gateway with runtime configuration."""
self.endpoint = settings.alibaba_endpoint
self.poll_interval_seconds = settings.aliyun_parse_poll_interval_seconds
self.timeout_seconds = settings.aliyun_parse_timeout_seconds
self.layout_step_size = settings.aliyun_parse_layout_step_size
self.llm_enhancement = settings.aliyun_llm_enhancement
self.enhancement_mode = settings.aliyun_enhancement_mode
def parse_document(self, *, file_path: str) -> AliyunParsePayload:
"""Parse a single document and return the collected layouts."""
client = self._create_client()
started_at = time.monotonic()
task_id = self._submit_job(client=client, file_path=file_path)
poll_attempts = self._wait_for_completion(client=client, task_id=task_id, started_at=started_at)
layouts = self._collect_all_results(client=client, task_id=task_id)
duration_ms = int((time.monotonic() - started_at) * 1000)
return AliyunParsePayload(
task_id=task_id,
layouts=layouts,
poll_attempts=poll_attempts,
duration_ms=duration_ms,
)
def _create_client(self) -> DocmindClient:
"""Create a Docmind client using explicit AccessKey settings only."""
config = open_api_models.Config()
config.endpoint = self.endpoint
if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
raise ValueError(
"Missing Aliyun parser credentials. Set ALIBABA_ACCESS_KEY_ID and "
"ALIBABA_ACCESS_KEY_SECRET in the project root .env."
)
# Keep production behavior deterministic by using only project-configured credentials.
config.access_key_id = settings.alibaba_access_key_id
config.access_key_secret = settings.alibaba_access_key_secret
return DocmindClient(config)
def _submit_job(self, *, client: DocmindClient, file_path: str) -> str:
"""Submit an asynchronous Docmind parse job."""
path = Path(file_path)
with open(file_path, "rb") as file_stream:
request = docmind_models.SubmitDocParserJobAdvanceRequest(
file_url_object=file_stream,
file_name=path.name,
file_name_extension=path.suffix.lstrip("."),
llm_enhancement=self.llm_enhancement,
enhancement_mode=self.enhancement_mode,
)
runtime = util_models.RuntimeOptions()
response = client.submit_doc_parser_job_advance(request, runtime)
task_id = response.body.data.id if response.body and response.body.data else ""
if not task_id:
raise RuntimeError("Aliyun Docmind did not return a parse task id.")
return task_id
def _query_status(self, *, client: DocmindClient, task_id: str) -> dict[str, Any] | None:
"""Query the current Docmind parse status."""
request = docmind_models.QueryDocParserStatusRequest(id=task_id)
response = client.query_doc_parser_status(request)
return response.body.data.to_map() if response.body and response.body.data else None
def _wait_for_completion(self, *, client: DocmindClient, task_id: str, started_at: float) -> int:
"""Poll until the parse job finishes or times out."""
poll_attempts = 0
while True:
poll_attempts += 1
status_payload = self._query_status(client=client, task_id=task_id)
if not status_payload:
raise RuntimeError(f"Aliyun parse status payload is empty for task {task_id}.")
status = str(status_payload.get("Status", "")).lower()
if status == "success":
return poll_attempts
if status == "failed":
raise RuntimeError(f"Aliyun parse task failed: {status_payload}")
elapsed = time.monotonic() - started_at
if elapsed > self.timeout_seconds:
raise TimeoutError(
f"Aliyun parse task timed out after {self.timeout_seconds}s: task_id={task_id}"
)
time.sleep(self.poll_interval_seconds)
def _collect_all_results(self, *, client: DocmindClient, task_id: str) -> list[dict[str, Any]]:
"""Collect all paginated layout results from a completed parse task."""
all_layouts: list[dict[str, Any]] = []
layout_num = 0
while True:
request = docmind_models.GetDocParserResultRequest(
id=task_id,
layout_step_size=self.layout_step_size,
layout_num=layout_num,
)
response = client.get_doc_parser_result(request)
payload = response.body.data if response.body else None
if not payload:
break
layouts = payload.get("layouts", [])
if not layouts:
break
all_layouts.extend(layouts)
layout_num += len(layouts)
if len(layouts) < self.layout_step_size:
break
if not all_layouts:
raise RuntimeError(f"Aliyun parse task returned no layouts: task_id={task_id}")
return all_layouts