"""Implement application-layer logic for services.""" from __future__ import annotations import os import tempfile import uuid from dataclasses import dataclass from loguru import logger from app.domain.documents import ( ChunkBuilder, Document, DocumentBinaryStore, DocumentParser, DocumentRepository, DocumentStatus, ) from app.domain.retrieval import EmbeddingProvider, VectorIndex # Keep orchestration logic centralized so use-case flow stays easy to trace. @dataclass class DocumentProcessResult: """Represent document process result data.""" doc_id: str doc_name: str status: str message: str num_chunks: int = 0 summary: str = "" summary_latency_ms: int = 0 class DocumentCommandService: """Provide the Document Command Service service.""" def __init__( self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore, parser: DocumentParser, chunk_builder: ChunkBuilder, embedding_provider: EmbeddingProvider, vector_index: VectorIndex, ) -> None: """Initialize the Document Command Service instance.""" self.document_repository = document_repository self.binary_store = binary_store self.parser = parser self.chunk_builder = chunk_builder self.embedding_provider = embedding_provider self.vector_index = vector_index def upload_and_process( self, *, doc_id: str | None = None, file_name: str, content: bytes, content_type: str, doc_name: str | None, regulation_type: str, version: str, generate_summary: bool, ) -> DocumentProcessResult: """Handle upload and process for the Document Command Service instance.""" doc_id = doc_id or str(uuid.uuid4())[:8] final_doc_name = doc_name or file_name object_name = f"{doc_id}/{file_name}" document = Document( doc_id=doc_id, doc_name=final_doc_name, file_name=file_name, object_name=object_name, content_type=content_type, size_bytes=len(content), regulation_type=regulation_type, version=version, metadata={"generate_summary": generate_summary}, ) self.document_repository.create(document) temp_path = "" try: self.binary_store.save( object_name=object_name, data=content, content_type=content_type, metadata={"doc_id": doc_id}, ) self.document_repository.update_status(doc_id, DocumentStatus.STORED) suffix = os.path.splitext(file_name)[1] with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file: temp_file.write(content) temp_path = temp_file.name parsed_document = self.parser.parse( file_path=temp_path, doc_id=doc_id, doc_name=final_doc_name, ) self.document_repository.update_status( doc_id, DocumentStatus.PARSED, parser_name=parsed_document.parser_name, metadata={"structure_nodes": len(parsed_document.structure_nodes)}, ) chunks = self.chunk_builder.build( parsed_document=parsed_document, regulation_type=regulation_type, version=version, ) if not chunks: raise ValueError("解析完成但没有生成可入库的 chunks") vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks]) inserted = self.vector_index.upsert(chunks, vectors) if inserted != len(chunks): logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks)) self.document_repository.update_status( doc_id, DocumentStatus.INDEXED, chunk_count=len(chunks), summary="", summary_latency_ms=0, index_name=self.vector_index.health().get("collection_name", ""), ) stored = self.document_repository.get(doc_id) return DocumentProcessResult( doc_id=doc_id, doc_name=final_doc_name, status=(stored.status.value if stored else DocumentStatus.INDEXED.value), message="处理成功", num_chunks=len(chunks), summary=stored.summary if stored else "", summary_latency_ms=stored.summary_latency_ms if stored else 0, ) except Exception as exc: logger.exception("文档处理失败: doc_id={}", doc_id) self.document_repository.update_status( doc_id, DocumentStatus.FAILED, error_message=str(exc), ) return DocumentProcessResult( doc_id=doc_id, doc_name=final_doc_name, status=DocumentStatus.FAILED.value, message=f"文档处理失败: {exc}", ) finally: if temp_path and os.path.exists(temp_path): try: os.remove(temp_path) except OSError: logger.warning("临时文件清理失败: {}", temp_path) class DocumentQueryService: """Provide the Document Query Service service.""" def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore) -> None: """Initialize the Document Query Service instance.""" self.document_repository = document_repository self.binary_store = binary_store def get(self, doc_id: str) -> Document | None: """Handle get for the Document Query Service instance.""" return self.document_repository.get(doc_id) def list_documents(self, limit: int | None = None) -> list[Document]: """List documents for the Document Query Service instance.""" return self.document_repository.list(limit=limit) def download(self, doc_id: str) -> tuple[Document, bytes]: """Handle download for the Document Query Service instance.""" document = self.document_repository.get(doc_id) if not document: raise FileNotFoundError(f"文档不存在: {doc_id}") return document, self.binary_store.read(document.object_name)