fix 文档管理模块 & 法规对话模块

This commit is contained in:
2026-05-20 23:34:08 +08:00
parent c22b03dc07
commit b065d55c86
39 changed files with 1671 additions and 540 deletions

View File

@@ -17,6 +17,7 @@ from app.domain.documents import (
DocumentParser,
DocumentRepository,
DocumentStatus,
ParseArtifactStore,
ParsedDocument,
)
from app.domain.retrieval import EmbeddingProvider, VectorIndex
@@ -47,6 +48,7 @@ class DocumentCommandService:
chunk_builder: ChunkBuilder,
embedding_provider: EmbeddingProvider,
vector_index: VectorIndex,
parse_artifact_store: ParseArtifactStore | None = None,
) -> None:
"""Initialize the Document Command Service instance."""
self.document_repository = document_repository
@@ -55,6 +57,7 @@ class DocumentCommandService:
self.chunk_builder = chunk_builder
self.embedding_provider = embedding_provider
self.vector_index = vector_index
self.parse_artifact_store = parse_artifact_store
def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
"""Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
@@ -143,6 +146,15 @@ class DocumentCommandService:
"processing_stage": "parsed",
},
)
if self.parse_artifact_store:
try:
self.parse_artifact_store.save(
doc_id,
parsed_document.structure_nodes,
parsed_document.semantic_blocks,
)
except Exception:
logger.warning("ParseArtifactStore.save failed for doc_id={}", doc_id)
chunks = self.chunk_builder.build(
parsed_document=parsed_document,
@@ -205,20 +217,120 @@ class DocumentCommandService:
logger.warning("临时文件清理失败: {}", temp_path)
def delete(self, doc_id: str) -> bool:
    """Delete a document's binary, vector chunks, parse artifacts, and record.

    Cleanup of the binary store, vector index, and parse-artifact store is
    best-effort: a failure in any of them is logged (including the cause)
    and does not stop the remaining steps. The repository record is removed
    last.

    Args:
        doc_id: Identifier of the document to delete.

    Returns:
        ``False`` when the repository has no record for ``doc_id`` (nothing
        is touched in that case), ``True`` otherwise.
    """
    # NOTE(review): documents that exist only in the vector index have no
    # repository record, so this returns False for them without removing
    # their vectors - confirm whether that is intended.
    document = self.document_repository.get(doc_id)
    if not document:
        return False

    try:
        self.binary_store.delete(document.object_name)
    except Exception as exc:
        # Keep going: an orphaned binary must not block removal of the rest.
        logger.warning("Binary delete failed for doc_id={}: {}", doc_id, exc)

    try:
        self.vector_index.delete_by_document(doc_id)
    except Exception as exc:
        logger.warning("Vector delete failed for doc_id={}: {}", doc_id, exc)

    # Compare against None explicitly so a configured store that happens to
    # be falsy (e.g. defines __len__ and is empty) is still cleaned up.
    if self.parse_artifact_store is not None:
        try:
            self.parse_artifact_store.delete(doc_id)
        except Exception as exc:
            logger.warning("ParseArtifactStore delete failed for doc_id={}: {}", doc_id, exc)

    self.document_repository.delete(doc_id)
    return True
def retry(self, doc_id: str) -> DocumentProcessResult:
    """Run the processing pipeline again for a document using its stored binary.

    The document record is looked up, its binary is read back from the binary
    store, and both are handed to ``upload_and_process`` together with the
    metadata saved on the record. The document's current status is not
    inspected here.

    Args:
        doc_id: Identifier of the document to re-process.

    Returns:
        The result of ``upload_and_process``, or a ``failed`` result when no
        record exists for ``doc_id``.
    """
    document = self.document_repository.get(doc_id)
    if not document:
        return DocumentProcessResult(doc_id=doc_id, doc_name="", status="failed", message="文档不存在")

    stored_bytes = self.binary_store.read(document.object_name)
    reprocess_args = {
        "doc_id": doc_id,
        "file_name": document.file_name,
        "content": stored_bytes,
        "content_type": document.content_type,
        "doc_name": document.doc_name,
        "regulation_type": document.regulation_type,
        "version": document.version,
        # Reuse the summary preference saved in the record's metadata.
        "generate_summary": bool(document.metadata.get("generate_summary", False)),
    }
    return self.upload_and_process(**reprocess_args)
class DocumentQueryService:
"""Provide the Document Query Service service."""
def __init__(
    self,
    *,
    document_repository: DocumentRepository,
    binary_store: DocumentBinaryStore,
    vector_index: VectorIndex,
) -> None:
    """Initialize the Document Query Service instance.

    Args:
        document_repository: Store of document metadata records.
        binary_store: Store holding the uploaded document binaries.
        vector_index: Vector index kept on the instance as ``vector_index``.
    """
    self.document_repository = document_repository
    self.binary_store = binary_store
    self.vector_index = vector_index
def get(self, doc_id: str) -> Document | None:
    """Look up a single document record by its identifier.

    Args:
        doc_id: Identifier of the document to fetch.

    Returns:
        Whatever the document repository returns for ``doc_id``.
    """
    repository = self.document_repository
    return repository.get(doc_id)
def list_documents(self, limit: int | None = None) -> list[Document]:
    """Return documents reconciled against the live vector index (Milvus).

    Algorithm:
      1. Query the vector index for per-document metadata rows
         (doc_id, doc_name, chunk_count, ...).
      2. Load every record from the metadata repository.
      3. Records present in the index are marked INDEXED with the live
         chunk_count; records claiming INDEXED but absent from the index
         are demoted to FAILED.
      4. Index rows without a metadata record are surfaced as synthetic
         INDEXED entries so they stay visible in the management list.

    If the vector index cannot be queried, reconciliation is skipped and the
    repository records are returned unchanged.

    Args:
        limit: Maximum number of documents to return; ``None`` returns all.

    Returns:
        Documents sorted by ``updated_at`` (newest first), truncated to
        ``limit`` when one is given.
    """
    try:
        index_rows = self.vector_index.list_document_metadata()
    except Exception as exc:
        # An unreachable index says nothing about individual documents, so
        # do not demote INDEXED records to FAILED on the strength of it.
        logger.warning("Vector index metadata listing failed; skipping reconciliation: {}", exc)
        return self.document_repository.list(limit=limit)

    rows_by_id: dict[str, dict] = {row["doc_id"]: row for row in index_rows}

    # Load all records before reconciling: limiting the repository query
    # first would make indexed documents outside the page look index-only
    # and duplicate them as synthetic entries.
    records = self.document_repository.list(limit=None)
    known_ids = {record.doc_id for record in records}

    documents: list[Document] = []
    for record in records:
        row = rows_by_id.get(record.doc_id)
        if row is not None:
            record.chunk_count = row["chunk_count"]
            record.status = DocumentStatus.INDEXED
            # Backfill fields that may be missing from older records.
            for field in ("doc_name", "regulation_type", "version"):
                if not getattr(record, field) and row.get(field):
                    setattr(record, field, row[field])
        elif record.status == DocumentStatus.INDEXED:
            # Metadata says indexed but the index holds no chunks.
            record.status = DocumentStatus.FAILED
            record.error_message = "向量数据库中未找到对应数据"
        documents.append(record)

    # Surface index-only documents that have no metadata record at all.
    for doc_id, row in rows_by_id.items():
        if doc_id in known_ids:
            continue
        # Fall back to the id when the row has no usable name.
        display_name = row.get("doc_name") or doc_id
        documents.append(
            Document(
                doc_id=doc_id,
                doc_name=display_name,
                file_name=display_name,
                object_name="",
                content_type="",
                size_bytes=0,
                status=DocumentStatus.INDEXED,
                regulation_type=row.get("regulation_type", ""),
                version=row.get("version", ""),
                chunk_count=row["chunk_count"],
            )
        )

    documents.sort(key=lambda d: d.updated_at, reverse=True)
    return documents[:limit] if limit is not None else documents
def download(self, doc_id: str) -> tuple[Document, bytes]:
"""Handle download for the Document Query Service instance."""

View File

@@ -3,17 +3,24 @@
from __future__ import annotations
from app.domain.retrieval import RetrievalQuery, Retriever, RetrievedChunk
from app.domain.retrieval.ports import Reranker
# Keep orchestration logic centralized so use-case flow stays easy to trace.
class KnowledgeRetrievalService:
    """Retrieve knowledge chunks for a query, optionally reranking them."""

    # When a reranker is configured, over-fetch candidates so it has a wider
    # pool to reorder than the number of chunks finally returned.
    _CANDIDATE_MULTIPLIER = 4
    _MIN_CANDIDATES = 20

    def __init__(
        self,
        *,
        retriever: Retriever,
        reranker: Reranker | None = None,
        reranker_top_k: int = 5,
    ) -> None:
        """Initialize the Knowledge Retrieval Service instance.

        Args:
            retriever: Component that fetches candidate chunks for a query.
            reranker: Optional component that reorders the candidates.
            reranker_top_k: Upper bound on the number of chunks kept after
                reranking.
        """
        self.retriever = retriever
        self.reranker = reranker
        self.reranker_top_k = reranker_top_k

    def retrieve(self, *, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Retrieve and optionally rerank chunks for a query.

        Args:
            query: The query text.
            top_k: Maximum number of chunks the caller wants back.
            filters: Optional filter expression passed through to the retriever.

        Returns:
            At most ``top_k`` chunks. With a reranker configured the result is
            the reranker's output, additionally capped at ``reranker_top_k``.
        """
        if self.reranker is None:
            candidate_k = top_k
        else:
            candidate_k = max(top_k * self._CANDIDATE_MULTIPLIER, self._MIN_CANDIDATES)

        retrieval_query = RetrievalQuery(query=query, top_k=candidate_k, filters=filters)
        candidates = self.retriever.retrieve(retrieval_query)

        if self.reranker is not None and candidates:
            # Honour the caller's top_k as well as the configured cap so the
            # reranked path never returns more chunks than were requested.
            final_k = min(top_k, self.reranker_top_k)
            return self.reranker.rerank(query, candidates, top_k=final_k)
        return candidates[:top_k]