feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
ash66
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import os
import tempfile
import uuid
import json
from dataclasses import dataclass
from loguru import logger
@@ -16,6 +17,7 @@ from app.domain.documents import (
DocumentParser,
DocumentRepository,
DocumentStatus,
ParsedDocument,
)
from app.domain.retrieval import EmbeddingProvider, VectorIndex
# Keep orchestration logic centralized so use-case flow stays easy to trace.
@@ -54,6 +56,27 @@ class DocumentCommandService:
self.embedding_provider = embedding_provider
self.vector_index = vector_index
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
    """Persist parse artifacts so troubleshooting does not depend on provider retention windows.

    Each artifact (raw layouts, structure nodes, semantic blocks, vector
    chunks) is serialized as pretty-printed UTF-8 JSON and written to
    ``self.binary_store`` under ``<artifact_prefix>/<doc_id>/<name>.json``.

    Args:
        doc_id: Identifier of the document the artifacts belong to; used as
            the last directory segment of every object name.
        parsed_document: Parser output. ``metadata["artifact_prefix"]`` is
            read as the root prefix and defaults to ``"artifacts"`` when the
            key is missing or ``None``.

    Returns:
        Mapping of artifact name (``layouts``, ``structure_nodes``,
        ``semantic_blocks``, ``vector_chunks``) to the stored object name.
    """
    configured_prefix = parsed_document.metadata.get("artifact_prefix")
    if configured_prefix is None:
        # A key that is present but unset must behave like a missing key
        # instead of failing on ``None.strip``.
        configured_prefix = "artifacts"
    root = str(configured_prefix).strip("/")
    # An empty or slash-only prefix must not yield a key with a leading "/":
    # such object names are rejected or mangled by object stores.
    prefix = f"{root}/{doc_id}" if root else doc_id

    artifact_payloads = {
        "layouts": parsed_document.raw_layouts,
        "structure_nodes": parsed_document.structure_nodes,
        "semantic_blocks": parsed_document.semantic_blocks,
        "vector_chunks": parsed_document.vector_chunks,
    }
    artifact_keys: dict[str, str] = {}
    for name, payload in artifact_payloads.items():
        object_name = f"{prefix}/{name}.json"
        self.binary_store.save(
            object_name=object_name,
            # ensure_ascii=False keeps non-ASCII text readable in the stored file.
            data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
            content_type="application/json",
            metadata={"doc_id": doc_id, "artifact_type": name},
        )
        # Record the key only after the save call returned without raising.
        artifact_keys[name] = object_name
    return artifact_keys
def upload_and_process(
self,
*,
@@ -104,11 +127,21 @@ class DocumentCommandService:
doc_id=doc_id,
doc_name=final_doc_name,
)
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
self.document_repository.update_status(
doc_id,
DocumentStatus.PARSED,
parser_name=parsed_document.parser_name,
metadata={"structure_nodes": len(parsed_document.structure_nodes)},
metadata={
"parser_backend": parsed_document.parser_name,
"parse_task_id": parsed_document.metadata.get("task_id", ""),
"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
"structure_node_count": len(parsed_document.structure_nodes),
"semantic_block_count": len(parsed_document.semantic_blocks),
"vector_chunk_count": len(parsed_document.vector_chunks),
"artifact_keys": artifact_keys,
"processing_stage": "parsed",
},
)
chunks = self.chunk_builder.build(
@@ -124,13 +157,18 @@ class DocumentCommandService:
if inserted != len(chunks):
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
health = self.vector_index.health()
self.document_repository.update_status(
doc_id,
DocumentStatus.INDEXED,
chunk_count=len(chunks),
summary="",
summary_latency_ms=0,
index_name=self.vector_index.health().get("collection_name", ""),
index_name=health.get("collection_name", ""),
metadata={
"index_collection": health.get("collection_name", ""),
"processing_stage": "indexed",
},
)
stored = self.document_repository.get(doc_id)
return DocumentProcessResult(
@@ -148,6 +186,10 @@ class DocumentCommandService:
doc_id,
DocumentStatus.FAILED,
error_message=str(exc),
metadata={
"failure_reason": str(exc),
"processing_stage": "failed",
},
)
return DocumentProcessResult(
doc_id=doc_id,