Files
AIRegulation-DocAnalysis/backend/app/application/documents/services.py
ash66 c22b03dc07 feat: Migrate document parsing to Aliyun and update embedding configurations
- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings.
- Added new documents with failure reasons and metadata to documents.json for better error tracking.
- Created a new documentation file detailing the Aliyun ingest implementation process.
- Updated RFC to reflect changes in the parsing backend and embedding dimensions.
- Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions.
- Verified migration configurations to ensure correct settings for embedding model and backend.
2026-05-18 22:30:28 +08:00

229 lines
8.6 KiB
Python

"""Implement application-layer logic for services."""
from __future__ import annotations
import os
import tempfile
import uuid
import json
from dataclasses import dataclass
from loguru import logger
from app.domain.documents import (
ChunkBuilder,
Document,
DocumentBinaryStore,
DocumentParser,
DocumentRepository,
DocumentStatus,
ParsedDocument,
)
from app.domain.retrieval import EmbeddingProvider, VectorIndex
# Keep orchestration logic centralized so use-case flow stays easy to trace.
@dataclass
class DocumentProcessResult:
"""Represent document process result data."""
doc_id: str
doc_name: str
status: str
message: str
num_chunks: int = 0
summary: str = ""
summary_latency_ms: int = 0
class DocumentCommandService:
"""Provide the Document Command Service service."""
def __init__(
self,
*,
document_repository: DocumentRepository,
binary_store: DocumentBinaryStore,
parser: DocumentParser,
chunk_builder: ChunkBuilder,
embedding_provider: EmbeddingProvider,
vector_index: VectorIndex,
) -> None:
"""Initialize the Document Command Service instance."""
self.document_repository = document_repository
self.binary_store = binary_store
self.parser = parser
self.chunk_builder = chunk_builder
self.embedding_provider = embedding_provider
self.vector_index = vector_index
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
"""Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
prefix = f"{parsed_document.metadata.get('artifact_prefix', 'artifacts').strip('/')}/{doc_id}"
artifact_payloads = {
"layouts": parsed_document.raw_layouts,
"structure_nodes": parsed_document.structure_nodes,
"semantic_blocks": parsed_document.semantic_blocks,
"vector_chunks": parsed_document.vector_chunks,
}
artifact_keys: dict[str, str] = {}
for name, payload in artifact_payloads.items():
object_name = f"{prefix}/{name}.json"
self.binary_store.save(
object_name=object_name,
data=json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8"),
content_type="application/json",
metadata={"doc_id": doc_id, "artifact_type": name},
)
artifact_keys[name] = object_name
return artifact_keys
def upload_and_process(
self,
*,
doc_id: str | None = None,
file_name: str,
content: bytes,
content_type: str,
doc_name: str | None,
regulation_type: str,
version: str,
generate_summary: bool,
) -> DocumentProcessResult:
"""Handle upload and process for the Document Command Service instance."""
doc_id = doc_id or str(uuid.uuid4())[:8]
final_doc_name = doc_name or file_name
object_name = f"{doc_id}/{file_name}"
document = Document(
doc_id=doc_id,
doc_name=final_doc_name,
file_name=file_name,
object_name=object_name,
content_type=content_type,
size_bytes=len(content),
regulation_type=regulation_type,
version=version,
metadata={"generate_summary": generate_summary},
)
self.document_repository.create(document)
temp_path = ""
try:
self.binary_store.save(
object_name=object_name,
data=content,
content_type=content_type,
metadata={"doc_id": doc_id},
)
self.document_repository.update_status(doc_id, DocumentStatus.STORED)
suffix = os.path.splitext(file_name)[1]
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
temp_file.write(content)
temp_path = temp_file.name
parsed_document = self.parser.parse(
file_path=temp_path,
doc_id=doc_id,
doc_name=final_doc_name,
)
artifact_keys = self._save_parse_artifacts(doc_id=doc_id, parsed_document=parsed_document)
self.document_repository.update_status(
doc_id,
DocumentStatus.PARSED,
parser_name=parsed_document.parser_name,
metadata={
"parser_backend": parsed_document.parser_name,
"parse_task_id": parsed_document.metadata.get("task_id", ""),
"layout_count": parsed_document.metadata.get("layout_count", len(parsed_document.raw_layouts)),
"structure_node_count": len(parsed_document.structure_nodes),
"semantic_block_count": len(parsed_document.semantic_blocks),
"vector_chunk_count": len(parsed_document.vector_chunks),
"artifact_keys": artifact_keys,
"processing_stage": "parsed",
},
)
chunks = self.chunk_builder.build(
parsed_document=parsed_document,
regulation_type=regulation_type,
version=version,
)
if not chunks:
raise ValueError("解析完成但没有生成可入库的 chunks")
vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
inserted = self.vector_index.upsert(chunks, vectors)
if inserted != len(chunks):
logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))
health = self.vector_index.health()
self.document_repository.update_status(
doc_id,
DocumentStatus.INDEXED,
chunk_count=len(chunks),
summary="",
summary_latency_ms=0,
index_name=health.get("collection_name", ""),
metadata={
"index_collection": health.get("collection_name", ""),
"processing_stage": "indexed",
},
)
stored = self.document_repository.get(doc_id)
return DocumentProcessResult(
doc_id=doc_id,
doc_name=final_doc_name,
status=(stored.status.value if stored else DocumentStatus.INDEXED.value),
message="处理成功",
num_chunks=len(chunks),
summary=stored.summary if stored else "",
summary_latency_ms=stored.summary_latency_ms if stored else 0,
)
except Exception as exc:
logger.exception("文档处理失败: doc_id={}", doc_id)
self.document_repository.update_status(
doc_id,
DocumentStatus.FAILED,
error_message=str(exc),
metadata={
"failure_reason": str(exc),
"processing_stage": "failed",
},
)
return DocumentProcessResult(
doc_id=doc_id,
doc_name=final_doc_name,
status=DocumentStatus.FAILED.value,
message=f"文档处理失败: {exc}",
)
finally:
if temp_path and os.path.exists(temp_path):
try:
os.remove(temp_path)
except OSError:
logger.warning("临时文件清理失败: {}", temp_path)
class DocumentQueryService:
"""Provide the Document Query Service service."""
def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore) -> None:
"""Initialize the Document Query Service instance."""
self.document_repository = document_repository
self.binary_store = binary_store
def get(self, doc_id: str) -> Document | None:
"""Handle get for the Document Query Service instance."""
return self.document_repository.get(doc_id)
def list_documents(self, limit: int | None = None) -> list[Document]:
"""List documents for the Document Query Service instance."""
return self.document_repository.list(limit=limit)
def download(self, doc_id: str) -> tuple[Document, bytes]:
"""Handle download for the Document Query Service instance."""
document = self.document_repository.get(doc_id)
if not document:
raise FileNotFoundError(f"文档不存在: {doc_id}")
return document, self.binary_store.read(document.object_name)