fix 文档管理模块 & 法规对话模块

2026-05-20 23:34:08 +08:00
parent c22b03dc07
commit b065d55c86
39 changed files with 1671 additions and 540 deletions
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -17,6 +17,7 @@ from app.domain.documents import (
    DocumentParser,
    DocumentRepository,
    DocumentStatus,
+    ParseArtifactStore,
    ParsedDocument,
 )
 from app.domain.retrieval import EmbeddingProvider, VectorIndex
@@ -47,6 +48,7 @@ class DocumentCommandService:
        chunk_builder: ChunkBuilder,
        embedding_provider: EmbeddingProvider,
        vector_index: VectorIndex,
+        parse_artifact_store: ParseArtifactStore | None = None,
    ) -> None:
        """Initialize the Document Command Service instance."""
        self.document_repository = document_repository
@@ -55,6 +57,7 @@ class DocumentCommandService:
        self.chunk_builder = chunk_builder
        self.embedding_provider = embedding_provider
        self.vector_index = vector_index
+        self.parse_artifact_store = parse_artifact_store

    def _save_parse_artifacts(self, *, doc_id: str, parsed_document: ParsedDocument) -> dict[str, str]:
        """Persist parse artifacts so troubleshooting does not depend on provider retention windows."""
@@ -143,6 +146,15 @@ class DocumentCommandService:
                    "processing_stage": "parsed",
                },
            )
+            if self.parse_artifact_store:
+                try:
+                    self.parse_artifact_store.save(
+                        doc_id,
+                        parsed_document.structure_nodes,
+                        parsed_document.semantic_blocks,
+                    )
+                except Exception:
+                    logger.warning("ParseArtifactStore.save failed for doc_id={}", doc_id)

            chunks = self.chunk_builder.build(
                parsed_document=parsed_document,
@@ -205,20 +217,120 @@ class DocumentCommandService:
                    logger.warning("临时文件清理失败: {}", temp_path)


+    def delete(self, doc_id: str) -> bool:
+        """Delete a document's binary, vector chunks, parse artifacts, and metadata record.
+
+        Cleanup of the storage backends is best-effort: a failure in one step is
+        logged together with its cause and does not stop the remaining steps, and
+        the metadata record is removed afterwards regardless.
+
+        Args:
+            doc_id: Identifier of the document to delete.
+
+        Returns:
+            ``False`` when the repository has no record for ``doc_id`` (nothing
+            is deleted in that case); ``True`` once the record has been removed.
+        """
+        document = self.document_repository.get(doc_id)
+        if not document:
+            # NOTE(review): DocumentQueryService.list_documents also lists documents
+            # that exist only in the vector index; they cannot be removed through
+            # this path because it stops here. Confirm whether that is intended.
+            return False
+        try:
+            self.binary_store.delete(document.object_name)
+        except Exception as exc:
+            # Keep the cause in the log so a failed cleanup can be diagnosed later.
+            logger.warning("Binary delete failed for doc_id={}: {}", doc_id, exc)
+        try:
+            self.vector_index.delete_by_document(doc_id)
+        except Exception as exc:
+            logger.warning("Vector delete failed for doc_id={}: {}", doc_id, exc)
+        if self.parse_artifact_store:
+            try:
+                self.parse_artifact_store.delete(doc_id)
+            except Exception as exc:
+                logger.warning("ParseArtifactStore delete failed for doc_id={}: {}", doc_id, exc)
+        # The record goes last so the cleanup steps above can still read it.
+        self.document_repository.delete(doc_id)
+        return True
+
+    def retry(self, doc_id: str) -> DocumentProcessResult:
+        """Run the upload-and-process pipeline again from the document's stored binary.
+
+        Every processing argument is taken from the stored record, so the retry
+        reuses the original ``doc_id``, file name, content type, naming fields,
+        and the ``generate_summary`` flag kept in the record's metadata.
+
+        Args:
+            doc_id: Identifier of the document to re-process.
+
+        Returns:
+            The result of ``upload_and_process``, or a ``failed`` result when no
+            record exists for ``doc_id``.
+        """
+        stored = self.document_repository.get(doc_id)
+        if not stored:
+            return DocumentProcessResult(doc_id=doc_id, doc_name="", status="failed", message="文档不存在")
+
+        # The binary is read before the arguments are assembled; a read failure
+        # is not caught here and propagates to the caller.
+        raw_bytes = self.binary_store.read(stored.object_name)
+        reprocess_kwargs = {
+            "doc_id": doc_id,
+            "file_name": stored.file_name,
+            "content": raw_bytes,
+            "content_type": stored.content_type,
+            "doc_name": stored.doc_name,
+            "regulation_type": stored.regulation_type,
+            "version": stored.version,
+            # A missing flag is treated as "do not generate a summary".
+            "generate_summary": bool(stored.metadata.get("generate_summary", False)),
+        }
+        return self.upload_and_process(**reprocess_kwargs)
+
+
 class DocumentQueryService:
    """Provide the Document Query Service service."""
-    def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore) -> None:
+    def __init__(self, *, document_repository: DocumentRepository, binary_store: DocumentBinaryStore, vector_index: VectorIndex) -> None:
        """Initialize the Document Query Service instance."""
        self.document_repository = document_repository
        self.binary_store = binary_store
+        # Queried by list_documents for per-document metadata held in the vector index.
+        self.vector_index = vector_index

    def get(self, doc_id: str) -> Document | None:
-        """Handle get for the Document Query Service instance."""
+        """Return whatever the document repository holds for ``doc_id`` (``None`` is allowed by the return type)."""
        return self.document_repository.get(doc_id)

    def list_documents(self, limit: int | None = None) -> list[Document]:
-        """List documents for the Document Query Service instance."""
-        return self.document_repository.list(limit=limit)
+        """Return documents reconciled against the vector index (Milvus) as the live source.
+
+        Algorithm:
+        1. Query the vector index for per-document metadata (doc_id, doc_name,
+           chunk_count, …).
+        2. Load every metadata record from the repository.
+        3. Merge: records present in the vector index get status=INDEXED and the
+           live chunk_count; records marked INDEXED but absent from the vector
+           index are reported as FAILED.
+        4. Vector-only documents (no metadata record) are surfaced as synthetic
+           INDEXED entries so they are never invisible to the management list.
+        5. Sort by ``updated_at`` (newest first) and only then apply ``limit``.
+
+        If the vector index query itself fails, reconciliation is skipped and the
+        metadata records are returned as stored: an outage must not make every
+        indexed document look FAILED.
+
+        Args:
+            limit: Maximum number of documents to return; ``None`` returns all.
+
+        Returns:
+            The merged document list, newest first.
+        """
+        try:
+            milvus_rows = self.vector_index.list_document_metadata()
+        except Exception as exc:
+            logger.warning("Vector index metadata query failed; skipping reconciliation: {}", exc)
+            # None (as opposed to an empty list) marks the index as unreachable.
+            milvus_rows = None
+
+        # The limit is applied after the merge. Limiting the metadata query first
+        # would make indexed documents outside that page look like vector-only
+        # documents and surface them a second time as synthetic entries.
+        meta_docs = self.document_repository.list(limit=None)
+
+        result: list[Document] = []
+        if milvus_rows is None:
+            result.extend(meta_docs)
+        else:
+            milvus_by_id: dict[str, dict] = {row["doc_id"]: row for row in milvus_rows}
+            meta_ids = {doc.doc_id for doc in meta_docs}
+
+            # Reconcile metadata records against the vector index.
+            # NOTE(review): this mutates the objects returned by the repository;
+            # confirm the repository hands out copies rather than cached instances.
+            for doc in meta_docs:
+                row = milvus_by_id.get(doc.doc_id)
+                if row is not None:
+                    doc.chunk_count = row["chunk_count"]
+                    doc.status = DocumentStatus.INDEXED
+                    # Backfill fields that may be missing from older records.
+                    if not doc.doc_name and row.get("doc_name"):
+                        doc.doc_name = row["doc_name"]
+                    if not doc.regulation_type and row.get("regulation_type"):
+                        doc.regulation_type = row["regulation_type"]
+                    if not doc.version and row.get("version"):
+                        doc.version = row["version"]
+                elif doc.status == DocumentStatus.INDEXED:
+                    # Metadata says indexed but the vector index has no chunks.
+                    doc.status = DocumentStatus.FAILED
+                    doc.error_message = "向量数据库中未找到对应数据"
+                result.append(doc)
+
+            # Surface vector-only documents that have no metadata record at all.
+            for doc_id, row in milvus_by_id.items():
+                if doc_id in meta_ids:
+                    continue
+                # Fall back to the id when the name is missing or empty.
+                display_name = row.get("doc_name") or doc_id
+                result.append(
+                    Document(
+                        doc_id=doc_id,
+                        doc_name=display_name,
+                        file_name=display_name,
+                        object_name="",
+                        content_type="",
+                        size_bytes=0,
+                        status=DocumentStatus.INDEXED,
+                        regulation_type=row.get("regulation_type") or "",
+                        version=row.get("version") or "",
+                        chunk_count=row["chunk_count"],
+                    )
+                )
+
+        # NOTE(review): assumes Document supplies a comparable default updated_at
+        # for the synthetic entries built above. TODO confirm.
+        result.sort(key=lambda d: d.updated_at, reverse=True)
+        return result[:limit] if limit is not None else result

    def download(self, doc_id: str) -> tuple[Document, bytes]:
        """Handle download for the Document Query Service instance."""
--- a/backend/app/application/knowledge/services.py
+++ b/backend/app/application/knowledge/services.py
@@ -3,17 +3,24 @@
 from __future__ import annotations

 from app.domain.retrieval import RetrievalQuery, Retriever, RetrievedChunk
+from app.domain.retrieval.ports import Reranker
 # Keep orchestration logic centralized so use-case flow stays easy to trace.


-
 class KnowledgeRetrievalService:
    """Provide the Knowledge Retrieval Service service."""
-    def __init__(self, *, retriever: Retriever) -> None:
+
+    def __init__(self, *, retriever: Retriever, reranker: Reranker | None = None, reranker_top_k: int = 5) -> None:
        """Initialize the Knowledge Retrieval Service instance."""
        self.retriever = retriever
+        # Optional second-stage reranker; when set, retrieve() over-fetches candidates for it.
+        self.reranker = reranker
+        # Upper bound on how many chunks a reranked retrieve() call returns.
+        self.reranker_top_k = reranker_top_k

    def retrieve(self, *, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
-        """Handle retrieve for the Knowledge Retrieval Service instance."""
-        retrieval_query = RetrievalQuery(query=query, top_k=top_k, filters=filters)
-        return self.retriever.retrieve(retrieval_query)
+        """Retrieve chunks for a query, reranking them when a reranker is configured.
+
+        Without a reranker the retriever is asked for ``top_k`` chunks directly.
+        With a reranker a wider candidate pool of ``max(top_k * 4, 20)`` chunks
+        is fetched and reranked, and the result never exceeds the caller's
+        ``top_k`` nor the configured ``reranker_top_k``.
+
+        Args:
+            query: The search text.
+            top_k: Maximum number of chunks the caller wants back.
+            filters: Optional filter expression passed through to the retriever.
+
+        Returns:
+            The retrieved (and possibly reranked) chunks.
+        """
+        if self.reranker is None:
+            candidates = self.retriever.retrieve(RetrievalQuery(query=query, top_k=top_k, filters=filters))
+            return candidates[:top_k]
+
+        # Over-fetch so the reranker has a meaningful pool to choose from.
+        candidate_k = max(top_k * 4, 20)
+        candidates = self.retriever.retrieve(RetrievalQuery(query=query, top_k=candidate_k, filters=filters))
+        if not candidates:
+            return candidates[:top_k]
+        # Honour the caller's top_k: returning reranker_top_k unconditionally
+        # would hand back more chunks than were requested.
+        final_k = min(top_k, self.reranker_top_k)
+        return self.reranker.rerank(query, candidates, top_k=final_k)