187 lines
6.5 KiB
Python
187 lines
6.5 KiB
Python
|
|
"""Implement application-layer logic for services."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import os
|
||
|
|
import tempfile
|
||
|
|
import uuid
|
||
|
|
from dataclasses import dataclass
|
||
|
|
|
||
|
|
from loguru import logger
|
||
|
|
|
||
|
|
from app.domain.documents import (
|
||
|
|
ChunkBuilder,
|
||
|
|
Document,
|
||
|
|
DocumentBinaryStore,
|
||
|
|
DocumentParser,
|
||
|
|
DocumentRepository,
|
||
|
|
DocumentStatus,
|
||
|
|
)
|
||
|
|
from app.domain.retrieval import EmbeddingProvider, VectorIndex
|
||
|
|
# Keep orchestration logic centralized so use-case flow stays easy to trace.
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class DocumentProcessResult:
    """Outcome of a single upload-and-process run for one document.

    The first four fields are always supplied by the caller; the remaining
    ones fall back to "nothing produced" defaults when omitted.
    """

    # Identifier of the document the result refers to.
    doc_id: str
    # Display name of the document.
    doc_name: str
    # Processing status, carried as a plain string.
    status: str
    # Human-readable description of the outcome.
    message: str
    # Number of chunks produced for the document; 0 when not provided.
    num_chunks: int = 0
    # Summary text; empty when not provided.
    summary: str = ""
    # Summary latency (milliseconds, going by the field name); 0 when not provided.
    summary_latency_ms: int = 0
|
||
|
|
|
||
|
|
|
||
|
|
class DocumentCommandService:
    """Sequence the write-side document pipeline: store, parse, chunk, embed, index.

    Every collaborator is injected; the service only orders the steps and
    records progress on the document repository.
    """

    def __init__(
        self,
        *,
        document_repository: DocumentRepository,
        binary_store: DocumentBinaryStore,
        parser: DocumentParser,
        chunk_builder: ChunkBuilder,
        embedding_provider: EmbeddingProvider,
        vector_index: VectorIndex,
    ) -> None:
        """Keep references to the collaborators used by ``upload_and_process``.

        Args:
            document_repository: Creates document records and tracks their status.
            binary_store: Receives the raw uploaded bytes.
            parser: Turns a file on disk into a parsed document.
            chunk_builder: Builds chunks from a parsed document.
            embedding_provider: Produces vectors for the chunk texts.
            vector_index: Receives the chunks together with their vectors.
        """
        self.document_repository = document_repository
        self.binary_store = binary_store
        self.parser = parser
        self.chunk_builder = chunk_builder
        self.embedding_provider = embedding_provider
        self.vector_index = vector_index

    def upload_and_process(
        self,
        *,
        doc_id: str | None = None,
        file_name: str,
        content: bytes,
        content_type: str,
        doc_name: str | None,
        regulation_type: str,
        version: str,
        generate_summary: bool,
    ) -> DocumentProcessResult:
        """Store an uploaded file, then parse, chunk, embed and index it.

        Any failure after the document record has been created is logged and
        reported through a FAILED result instead of being raised.

        Args:
            doc_id: Identifier to use; when falsy, the first 8 characters of a
                random UUID4 are used instead.
            file_name: Original file name; also used for the object name and
                for the temp-file suffix.
            content: Raw file bytes.
            content_type: Content type forwarded to the binary store.
            doc_name: Display name; falls back to ``file_name`` when falsy.
            regulation_type: Forwarded to the document record and chunk builder.
            version: Forwarded to the document record and chunk builder.
            generate_summary: Recorded in the document metadata only; this
                method always stores an empty summary.

        Returns:
            A ``DocumentProcessResult`` describing success or failure.

        Raises:
            Whatever ``document_repository.create`` raises; that call runs
            before the guarded section.
        """
        doc_id = doc_id or str(uuid.uuid4())[:8]
        final_doc_name = doc_name or file_name
        object_name = f"{doc_id}/{file_name}"

        document = Document(
            doc_id=doc_id,
            doc_name=final_doc_name,
            file_name=file_name,
            object_name=object_name,
            content_type=content_type,
            size_bytes=len(content),
            regulation_type=regulation_type,
            version=version,
            metadata={"generate_summary": generate_summary},
        )
        self.document_repository.create(document)

        temp_path = ""
        try:
            self.binary_store.save(
                object_name=object_name,
                data=content,
                content_type=content_type,
                metadata={"doc_id": doc_id},
            )
            self.document_repository.update_status(doc_id, DocumentStatus.STORED)

            # The parser takes a file path, so the bytes are spooled to a temp file
            # that keeps the original extension.
            suffix = os.path.splitext(file_name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                # Record the path before writing: if the write fails, the file
                # already exists on disk and the finally-clause must remove it.
                temp_path = temp_file.name
                temp_file.write(content)

            parsed_document = self.parser.parse(
                file_path=temp_path,
                doc_id=doc_id,
                doc_name=final_doc_name,
            )
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.PARSED,
                parser_name=parsed_document.parser_name,
                metadata={"structure_nodes": len(parsed_document.structure_nodes)},
            )

            chunks = self.chunk_builder.build(
                parsed_document=parsed_document,
                regulation_type=regulation_type,
                version=version,
            )
            if not chunks:
                raise ValueError("解析完成但没有生成可入库的 chunks")

            vectors = self.embedding_provider.embed_texts([chunk.embedding_text for chunk in chunks])
            inserted = self.vector_index.upsert(chunks, vectors)
            if inserted != len(chunks):
                # A count mismatch is reported but does not fail the run.
                logger.warning("Milvus upsert count mismatched: inserted={}, chunks={}", inserted, len(chunks))

            self.document_repository.update_status(
                doc_id,
                DocumentStatus.INDEXED,
                chunk_count=len(chunks),
                summary="",
                summary_latency_ms=0,
                index_name=self.vector_index.health().get("collection_name", ""),
            )
            # Re-read the record so the result reflects what was actually persisted.
            stored = self.document_repository.get(doc_id)
            return DocumentProcessResult(
                doc_id=doc_id,
                doc_name=final_doc_name,
                status=(stored.status.value if stored else DocumentStatus.INDEXED.value),
                message="处理成功",
                num_chunks=len(chunks),
                summary=stored.summary if stored else "",
                summary_latency_ms=stored.summary_latency_ms if stored else 0,
            )
        except Exception as exc:
            logger.exception("文档处理失败: doc_id={}", doc_id)
            self._record_failure(doc_id, exc)
            return DocumentProcessResult(
                doc_id=doc_id,
                doc_name=final_doc_name,
                status=DocumentStatus.FAILED.value,
                message=f"文档处理失败: {exc}",
            )
        finally:
            self._remove_temp_file(temp_path)

    def _record_failure(self, doc_id: str, exc: Exception) -> None:
        """Persist the FAILED status without letting a repository error mask ``exc``."""
        try:
            self.document_repository.update_status(
                doc_id,
                DocumentStatus.FAILED,
                error_message=str(exc),
            )
        except Exception:
            # The caller still returns a FAILED result carrying the original
            # error, so this secondary failure is only logged.
            logger.exception("更新文档失败状态时出错: doc_id={}", doc_id)

    @staticmethod
    def _remove_temp_file(temp_path: str) -> None:
        """Delete the spooled temp file, if any; cleanup problems are only logged."""
        if not temp_path:
            return
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            # Already gone: nothing to clean up.
            return
        except OSError:
            logger.warning("临时文件清理失败: {}", temp_path)
|
||
|
|
|
||
|
|
|
||
|
|
class DocumentQueryService:
    """Read-side access to document records and their stored bytes."""

    def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore) -> None:
        """Remember the repository and binary store the query methods delegate to."""
        self.binary_store = binary_store
        self.document_repository = document_repository

    def get(self, doc_id: str) -> Document | None:
        """Return whatever the repository yields for ``doc_id`` (possibly ``None``)."""
        found = self.document_repository.get(doc_id)
        return found

    def list_documents(self, limit: int | None = None) -> list[Document]:
        """Return the repository listing, forwarding ``limit`` unchanged."""
        documents = self.document_repository.list(limit=limit)
        return documents

    def download(self, doc_id: str) -> tuple[Document, bytes]:
        """Return the document record together with its stored bytes.

        Raises:
            FileNotFoundError: If the repository has no (truthy) record for ``doc_id``.
        """
        found = self.document_repository.get(doc_id)
        if found:
            # The binary is addressed by the object name kept on the record.
            payload = self.binary_store.read(found.object_name)
            return found, payload
        raise FileNotFoundError(f"文档不存在: {doc_id}")
|