feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
This commit is contained in:
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import os
|
||||
import tempfile
|
||||
import uuid
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
|
||||
from loguru import logger
|
||||
@@ -16,6 +17,7 @@ from app.domain.documents import (
|
||||
DocumentParser,
|
||||
DocumentRepository,
|
||||
DocumentStatus,
|
||||
ParsedDocument,
|
||||
)
|
||||
from app.domain.retrieval import EmbeddingProvider, VectorIndex
|
||||
# Keep orchestration logic centralized so use-case flow stays easy to trace.
|
||||
@@ -54,6 +56,27 @@ class DocumentCommandService:
|
||||
self.embedding_provider = embedding_provider
|
||||
self.vector_index = vector_index
|
||||
|
||||
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
    """Persist parse artifacts so troubleshooting does not depend on provider retention windows.

    Each intermediate parse product is serialized as pretty-printed UTF-8 JSON
    and written through ``self.binary_store.save`` under
    ``<artifact_prefix>/<doc_id>/<name>.json``.

    Args:
        doc_id: Document identifier; used as the last segment of the object
            prefix and recorded in every stored object's metadata.
        parsed_document: Parser output providing ``metadata``, ``raw_layouts``,
            ``structure_nodes``, ``semantic_blocks`` and ``vector_chunks``.

    Returns:
        Mapping from artifact name (``layouts``, ``structure_nodes``,
        ``semantic_blocks``, ``vector_chunks``) to the object name it was
        stored under.

    Raises:
        Any exception raised by ``json.dumps`` (e.g. ``TypeError`` for a
        payload that is not JSON serializable) or by ``binary_store.save``
        is propagated unchanged.
    """
    configured_prefix = parsed_document.metadata.get("artifact_prefix")
    if configured_prefix is None:
        # Key missing or explicitly null: fall back to the default location
        # instead of failing on ``None.strip``.
        configured_prefix = "artifacts"
    base = configured_prefix.strip("/")
    # An empty (or "/"-only) prefix must not yield a key starting with "/".
    prefix = f"{base}/{doc_id}" if base else doc_id

    artifact_payloads = {
        "layouts": parsed_document.raw_layouts,
        "structure_nodes": parsed_document.structure_nodes,
        "semantic_blocks": parsed_document.semantic_blocks,
        "vector_chunks": parsed_document.vector_chunks,
    }

    artifact_keys: dict[str, str] = {}
    for name, payload in artifact_payloads.items():
        object_name = f"{prefix}/{name}.json"
        self.binary_store.save(
            object_name=object_name,
            # ensure_ascii=False keeps non-ASCII text readable in the stored file.
            data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
            content_type="application/json",
            metadata={"doc_id": doc_id, "artifact_type": name},
        )
        artifact_keys[name] = object_name
    return artifact_keys
|
||||
|
||||
def upload_and_process(
|
||||
self,
|
||||
*,
|
||||
@@ -104,11 +127,21 @@ class DocumentCommandService:
|
||||
doc_id=doc_id,
|
||||
doc_name=final_doc_name,
|
||||
)
|
||||
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
|
||||
self.document_repository.update_status(
|
||||
doc_id,
|
||||
DocumentStatus.PARSED,
|
||||
parser_name=parsed_document.parser_name,
|
||||
metadata={"structure_nodes": len(parsed_document.structure_nodes)},
|
||||
metadata={
|
||||
"parser_backend": parsed_document.parser_name,
|
||||
"parse_task_id": parsed_document.metadata.get("task_id", ""),
|
||||
"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
|
||||
"structure_node_count": len(parsed_document.structure_nodes),
|
||||
"semantic_block_count": len(parsed_document.semantic_blocks),
|
||||
"vector_chunk_count": len(parsed_document.vector_chunks),
|
||||
"artifact_keys": artifact_keys,
|
||||
"processing_stage": "parsed",
|
||||
},
|
||||
)
|
||||
|
||||
chunks = self.chunk_builder.build(
|
||||
@@ -124,13 +157,18 @@ class DocumentCommandService:
|
||||
if inserted != len(chunks):
|
||||
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
|
||||
|
||||
health = self.vector_index.health()
|
||||
self.document_repository.update_status(
|
||||
doc_id,
|
||||
DocumentStatus.INDEXED,
|
||||
chunk_count=len(chunks),
|
||||
summary="",
|
||||
summary_latency_ms=0,
|
||||
index_name=self.vector_index.health().get("collection_name", ""),
|
||||
index_name=health.get("collection_name", ""),
|
||||
metadata={
|
||||
"index_collection": health.get("collection_name", ""),
|
||||
"processing_stage": "indexed",
|
||||
},
|
||||
)
|
||||
stored = self.document_repository.get(doc_id)
|
||||
return DocumentProcessResult(
|
||||
@@ -148,6 +186,10 @@ class DocumentCommandService:
|
||||
doc_id,
|
||||
DocumentStatus.FAILED,
|
||||
error_message=str(exc),
|
||||
metadata={
|
||||
"failure_reason": str(exc),
|
||||
"processing_stage": "failed",
|
||||
},
|
||||
)
|
||||
return DocumentProcessResult(
|
||||
doc_id=doc_id,
|
||||
|
||||
Reference in New Issue
Block a user