fix 文档管理模块 & 法规对话模块
This commit is contained in:
@@ -17,6 +17,7 @@ from app.domain.documents import (
|
||||
DocumentParser,
|
||||
DocumentRepository,
|
||||
DocumentStatus,
|
||||
ParseArtifactStore,
|
||||
ParsedDocument,
|
||||
)
|
||||
from app.domain.retrieval import EmbeddingProvider, VectorIndex
|
||||
@@ -47,6 +48,7 @@ class DocumentCommandService:
|
||||
chunk_builder: ChunkBuilder,
|
||||
embedding_provider: EmbeddingProvider,
|
||||
vector_index: VectorIndex,
|
||||
parse_artifact_store: ParseArtifactStore | None = None,
|
||||
) -> None:
|
||||
"""Initialize the Document Command Service instance."""
|
||||
self.document_repository = document_repository
|
||||
@@ -55,6 +57,7 @@ class DocumentCommandService:
|
||||
self.chunk_builder = chunk_builder
|
||||
self.embedding_provider = embedding_provider
|
||||
self.vector_index = vector_index
|
||||
self.parse_artifact_store = parse_artifact_store
|
||||
|
||||
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
|
||||
"""Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
|
||||
@@ -143,6 +146,15 @@ class DocumentCommandService:
|
||||
"processing_stage": "parsed",
|
||||
},
|
||||
)
|
||||
if self.parse_artifact_store:
|
||||
try:
|
||||
self.parse_artifact_store.save(
|
||||
doc_id,
|
||||
parsed_document.structure_nodes,
|
||||
parsed_document.semantic_blocks,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("ParseArtifactStore.save failed for doc_id={}", doc_id)
|
||||
|
||||
chunks = self.chunk_builder.build(
|
||||
parsed_document=parsed_document,
|
||||
@@ -205,20 +217,120 @@ class DocumentCommandService:
|
||||
logger.warning("临时文件清理失败: {}", temp_path)
|
||||
|
||||
|
||||
def delete(self, doc_id: str) -> bool:
    """Delete a document's binary, vector chunks, parse artifacts, and record.

    Cleanup of the binary store, the vector index, and the optional parse
    artifact store is best-effort: a failure in any of them is logged
    together with the exception and does not stop the repository record
    from being removed. An exception raised by the repository delete itself
    is not caught here.

    Args:
        doc_id: Identifier of the document to delete.

    Returns:
        False when the repository has no record for ``doc_id`` (nothing is
        deleted in that case); True once the repository record is removed.
    """
    document = self.document_repository.get(doc_id)
    if not document:
        # NOTE(review): ids that exist only in the vector index cannot be
        # removed through this method because it returns before any cleanup.
        return False

    try:
        self.binary_store.delete(document.object_name)
    except Exception as exc:
        # Keep going: a missing or unreachable binary must not block deletion.
        logger.warning("Binary delete failed for doc_id={}: {}", doc_id, exc)

    try:
        self.vector_index.delete_by_document(doc_id)
    except Exception as exc:
        logger.warning("Vector delete failed for doc_id={}: {}", doc_id, exc)

    # The artifact store is optional; skip it when none was configured.
    if self.parse_artifact_store:
        try:
            self.parse_artifact_store.delete(doc_id)
        except Exception as exc:
            logger.warning("ParseArtifactStore delete failed for doc_id={}: {}", doc_id, exc)

    self.document_repository.delete(doc_id)
    return True
|
||||
|
||||
def retry(self, doc_id: str) -> DocumentProcessResult:
    """Run the upload-and-process pipeline again for an already stored document.

    The stored binary is read back and handed to ``upload_and_process``
    together with the attributes saved on the document record.

    Args:
        doc_id: Identifier of the document to re-process.

    Returns:
        A failed ``DocumentProcessResult`` when no record exists for
        ``doc_id``; otherwise whatever ``upload_and_process`` returns.
    """
    stored = self.document_repository.get(doc_id)
    if not stored:
        return DocumentProcessResult(doc_id=doc_id, doc_name="", status="failed", message="文档不存在")

    # Read the payload before assembling the call so a read error surfaces first.
    payload = self.binary_store.read(stored.object_name)
    wants_summary = bool(stored.metadata.get("generate_summary", False))

    reprocess_args = {
        "doc_id": doc_id,
        "file_name": stored.file_name,
        "content": payload,
        "content_type": stored.content_type,
        "doc_name": stored.doc_name,
        "regulation_type": stored.regulation_type,
        "version": stored.version,
        "generate_summary": wants_summary,
    }
    return self.upload_and_process(**reprocess_args)
|
||||
|
||||
|
||||
class DocumentQueryService:
|
||||
"""Provide the Document Query Service service."""
|
||||
def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore, vector_index: VectorIndex) -> None:
    """Store the collaborators used by the query methods.

    Args:
        document_repository: Source of document metadata records.
        binary_store: Store holding the document binaries.
        vector_index: Vector index kept on the instance as ``vector_index``.
    """
    self.document_repository = document_repository
    self.binary_store = binary_store
    self.vector_index = vector_index
|
||||
|
||||
def get(self, doc_id: str) -> Document | None:
    """Look up one document record by id through the repository.

    Args:
        doc_id: Identifier of the document to fetch.

    Returns:
        Whatever the repository's ``get`` returns for ``doc_id``.
    """
    found = self.document_repository.get(doc_id)
    return found
|
||||
|
||||
def list_documents(self, limit: int | None = None) -> list[Document]:
    """Return documents reconciled against the live vector index (Milvus).

    Algorithm:
        1. Ask the vector index for per-document metadata rows.
        2. Load every metadata record from the repository. The limit is
           applied only after merging, so a record cut off by the limit is
           never mistaken for an index-only document.
        3. Records present in the index get status INDEXED and the live
           chunk count; empty name/type/version fields are backfilled from
           the index row. Records marked INDEXED but absent from the index
           are demoted to FAILED.
        4. Index rows without a metadata record are surfaced as synthetic
           INDEXED documents so they stay visible in the management list.

    When the vector index lookup fails, reconciliation is skipped and the
    metadata records are returned with their stored status, rather than
    demoting every indexed document because of an outage.

    Records returned by the repository are modified in place.

    Args:
        limit: Maximum number of documents to return; None returns all.

    Returns:
        Documents sorted by ``updated_at`` (newest first), truncated to
        ``limit`` when one is given.
    """
    try:
        milvus_rows = self.vector_index.list_document_metadata()
        index_available = True
    except Exception as exc:
        logger.warning("Vector index metadata listing failed, skipping reconciliation: {}", exc)
        milvus_rows = []
        index_available = False

    milvus_by_id: dict[str, dict] = {row["doc_id"]: row for row in milvus_rows}

    # Load all records: reconciliation needs the complete id set.
    meta_docs = self.document_repository.list(limit=None)
    meta_ids = {doc.doc_id for doc in meta_docs}

    result: list[Document] = []

    for doc in meta_docs:
        row = milvus_by_id.get(doc.doc_id)
        if row is not None:
            doc.chunk_count = row["chunk_count"]
            doc.status = DocumentStatus.INDEXED
            # Backfill fields that may be missing from older records.
            if not doc.doc_name and row.get("doc_name"):
                doc.doc_name = row["doc_name"]
            if not doc.regulation_type and row.get("regulation_type"):
                doc.regulation_type = row["regulation_type"]
            if not doc.version and row.get("version"):
                doc.version = row["version"]
        elif index_available and doc.status == DocumentStatus.INDEXED:
            # Metadata says indexed but the index holds no chunks for it.
            doc.status = DocumentStatus.FAILED
            doc.error_message = "向量数据库中未找到对应数据"
        result.append(doc)

    # Surface index-only documents that have no metadata record at all.
    for doc_id, row in milvus_by_id.items():
        if doc_id in meta_ids:
            continue
        # Fall back to the id when the row has no usable name.
        display_name = row.get("doc_name") or doc_id
        result.append(
            Document(
                doc_id=doc_id,
                doc_name=display_name,
                file_name=display_name,
                object_name="",
                content_type="",
                size_bytes=0,
                status=DocumentStatus.INDEXED,
                regulation_type=row.get("regulation_type", ""),
                version=row.get("version", ""),
                chunk_count=row["chunk_count"],
            )
        )

    result.sort(key=lambda d: d.updated_at, reverse=True)
    return result[:limit] if limit is not None else result
|
||||
|
||||
def download(self, doc_id: str) -> tuple[Document, bytes]:
|
||||
"""Handle download for the Document Query Service instance."""
|
||||
|
||||
@@ -3,17 +3,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.retrieval import RetrievalQuery, Retriever, RetrievedChunk
|
||||
from app.domain.retrieval.ports import Reranker
|
||||
# Keep orchestration logic centralized so use-case flow stays easy to trace.
|
||||
|
||||
|
||||
|
||||
class KnowledgeRetrievalService:
    """Retrieve knowledge chunks for a query, optionally reranking them."""

    def __init__(self, *, retriever: Retriever, reranker: Reranker | None = None, reranker_top_k: int = 5) -> None:
        """Store the retriever and the optional reranker settings.

        Args:
            retriever: Component that fetches candidate chunks for a query.
            reranker: Optional component that reorders candidates; None
                disables reranking.
            reranker_top_k: Upper bound on results kept after reranking.
        """
        self.retriever = retriever
        self.reranker = reranker
        self.reranker_top_k = reranker_top_k

    def retrieve(self, *, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Retrieve chunks for a query and rerank them when a reranker is set.

        With a reranker configured, a wider candidate pool
        (``max(top_k * 4, 20)``) is fetched so the reranker has more to
        choose from. The reranked result is capped at
        ``min(top_k, reranker_top_k)`` so callers never receive more than
        the ``top_k`` they asked for.

        Args:
            query: Query text.
            top_k: Maximum number of chunks the caller wants back.
            filters: Optional filter passed through to the retrieval query.

        Returns:
            The reranker's output when reranking ran; otherwise the first
            ``top_k`` retrieved candidates.
        """
        use_reranker = self.reranker is not None
        candidate_k = max(top_k * 4, 20) if use_reranker else top_k
        retrieval_query = RetrievalQuery(query=query, top_k=candidate_k, filters=filters)
        candidates = self.retriever.retrieve(retrieval_query)

        # Skip the reranker call when there is nothing to rerank.
        if use_reranker and candidates:
            final_k = min(top_k, self.reranker_top_k)
            return self.reranker.rerank(query, candidates, top_k=final_k)
        return candidates[:top_k]
|
||||
|
||||
Reference in New Issue
Block a user