feat: Migrate document parsing to Aliyun and update embedding configurations

- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
2026-05-18 22:30:28 +08:00
parent 3f69cad404
commit c22b03dc07
26 changed files with 1092 additions and 6500 deletions
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import os
 import tempfile
 import uuid
+import json
 from dataclasses import dataclass

 from loguru import logger
@@ -16,6 +17,7 @@ from app.domain.documents import (
    DocumentParser,
    DocumentRepository,
    DocumentStatus,
+    ParsedDocument,
 )
 from app.domain.retrieval import EmbeddingProvider, VectorIndex
 # Keep orchestration logic centralized so use-case flow stays easy to trace.
@@ -54,6 +56,27 @@ class DocumentCommandService:
        self.embedding_provider = embedding_provider
        self.vector_index = vector_index

+    def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
+        """Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
+        prefix = f"{parsed_document.metadata.get('artifact_prefix', 'artifacts').strip('/')}/{doc_id}"
+        artifact_payloads = {
+            "layouts": parsed_document.raw_layouts,
+            "structure_nodes": parsed_document.structure_nodes,
+            "semantic_blocks": parsed_document.semantic_blocks,
+            "vector_chunks": parsed_document.vector_chunks,
+        }
+        artifact_keys: dict[str, str] = {}
+        for name, payload in artifact_payloads.items():
+            object_name = f"{prefix}/{name}.json"
+            self.binary_store.save(
+                object_name=object_name,
+                data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
+                content_type="application/json",
+                metadata={"doc_id": doc_id, "artifact_type": name},
+            )
+            artifact_keys[name] = object_name
+        return artifact_keys
+
    def upload_and_process(
        self,
        *,
@@ -104,11 +127,21 @@ class DocumentCommandService:
                doc_id=doc_id,
                doc_name=final_doc_name,
            )
+            artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.PARSED,
                parser_name=parsed_document.parser_name,
-                metadata={"structure_nodes": len(parsed_document.structure_nodes)},
+                metadata={
+                    "parser_backend": parsed_document.parser_name,
+                    "parse_task_id": parsed_document.metadata.get("task_id", ""),
+                    "layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
+                    "structure_node_count": len(parsed_document.structure_nodes),
+                    "semantic_block_count": len(parsed_document.semantic_blocks),
+                    "vector_chunk_count": len(parsed_document.vector_chunks),
+                    "artifact_keys": artifact_keys,
+                    "processing_stage": "parsed",
+                },
            )

            chunks = self.chunk_builder.build(
@@ -124,13 +157,18 @@ class DocumentCommandService:
            if inserted != len(chunks):
                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))

+            health = self.vector_index.health()
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.INDEXED,
                chunk_count=len(chunks),
                summary="",
                summary_latency_ms=0,
-                index_name=self.vector_index.health().get("collection_name", ""),
+                index_name=health.get("collection_name", ""),
+                metadata={
+                    "index_collection": health.get("collection_name", ""),
+                    "processing_stage": "indexed",
+                },
            )
            stored = self.document_repository.get(doc_id)
            return DocumentProcessResult(
@@ -148,6 +186,10 @@ class DocumentCommandService:
                doc_id,
                DocumentStatus.FAILED,
                error_message=str(exc),
+                metadata={
+                    "failure_reason": str(exc),
+                    "processing_stage": "failed",
+                },
            )
            return DocumentProcessResult(
                doc_id=doc_id,