Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions
--- a/backend/app/application/documents/services.py
+++ b/backend/app/application/documents/services.py
@@ -508,7 +508,7 @@ class DocumentQueryService:
        """Return documents with real-time state from Milvus as the authoritative source.

        Algorithm:
-        1. Query Milvus for all doc metadata (doc_id, doc_name, chunk_count, …).
+        1. Query Milvus for all doc metadata (doc_id, doc_title, chunk_count, …).
        2. Load JSON/PG metadata records and index them by doc_id.
        3. Merge: Milvus-present docs get status=INDEXED and live chunk_count;
           metadata-only docs with status=INDEXED are demoted to FAILED.
@@ -536,8 +536,8 @@ class DocumentQueryService:
                doc.chunk_count = row["chunk_count"]
                doc.status = DocumentStatus.INDEXED
                # Backfill fields that may be missing from older JSON records.
-                if not doc.doc_name and row.get("doc_name"):
-                    doc.doc_name = row["doc_name"]
+                if not doc.doc_name and row.get("doc_title"):
+                    doc.doc_name = row["doc_title"]
                if not doc.regulation_type and row.get("regulation_type"):
                    doc.regulation_type = row["regulation_type"]
                if not doc.version and row.get("version"):
@@ -553,8 +553,8 @@ class DocumentQueryService:
            if doc_id not in meta_by_id:
                synthetic = Document(
                    doc_id=doc_id,
-                    doc_name=row.get("doc_name", doc_id),
-                    file_name=row.get("doc_name", doc_id),
+                    doc_name=row.get("doc_title", doc_id),
+                    file_name=row.get("doc_title", doc_id),
                    object_name="",
                    content_type="",
                    size_bytes=0,
--- a/backend/app/application/knowledge/services.py
+++ b/backend/app/application/knowledge/services.py
@@ -29,11 +29,16 @@ def _reciprocal_rank_fusion(
        RetrievedChunk(
            chunk_id=chunk_map[ck].chunk_id,
            doc_id=chunk_map[ck].doc_id,
-            doc_name=chunk_map[ck].doc_name,
-            content=chunk_map[ck].content,
+            doc_title=chunk_map[ck].doc_title,
+            text=chunk_map[ck].text,
            score=scores[ck],
+            chunk_type=chunk_map[ck].chunk_type,
            section_title=chunk_map[ck].section_title,
-            page_number=chunk_map[ck].page_number,
+            page_start=chunk_map[ck].page_start,
+            page_end=chunk_map[ck].page_end,
+            section_level=chunk_map[ck].section_level,
+            chunk_index=chunk_map[ck].chunk_index,
+            piece_index=chunk_map[ck].piece_index,
            metadata=chunk_map[ck].metadata,
        )
        for ck in sorted_keys
--- a/backend/app/application/perception/services.py
+++ b/backend/app/application/perception/services.py
@@ -71,9 +71,9 @@ class PerceptionService:
                    affected_docs.append(
                        {
                            "doc_id": chunk.doc_id,
-                            "doc_name": chunk.doc_name,
+                            "doc_title": chunk.doc_title,
                            "score": round(float(chunk.score), 4),
-                            "snippet": (chunk.content or "")[:180],
+                            "snippet": (chunk.text or "")[:180],
                            "clause": getattr(chunk, "section_title", "") or "",
                        }
                    )
@@ -84,7 +84,7 @@ class PerceptionService:

        # --- 2. Build context from retrieved chunks ---
        context_parts = [
-            f"[文档{i}: {c.doc_name}]\n{(c.content or '')[:400]}"
+            f"[文档{i}: {c.doc_title}]\n{(c.text or '')[:400]}"
            for i, c in enumerate(chunks[:5], 1)
        ]
        context = "\n\n".join(context_parts) if context_parts else "（知识库中暂无相关文档）"