Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions
--- a/backend/app/infrastructure/vectorstore/bm25_retriever.py
+++ b/backend/app/infrastructure/vectorstore/bm25_retriever.py
@@ -56,7 +56,21 @@ class BM25Retriever:
        try:
            rows = self._vector_index.collection.query(
                expr='doc_id != ""',
-                output_fields=["id", "doc_id", "doc_name", "content", "section_title", "page_number"],
+                output_fields=[
+                    "id",
+                    "chunk_id",
+                    "doc_id",
+                    "doc_title",
+                    "text",
+                    "chunk_type",
+                    "section_title",
+                    "page_start",
+                    "page_end",
+                    "section_level",
+                    "chunk_index",
+                    "piece_index",
+                    "metadata_json",
+                ],
                limit=16384,
            )
        except Exception:
@@ -64,19 +78,33 @@ class BM25Retriever:
            return []
        return [
            RetrievedChunk(
-                chunk_id=str(row.get("id", "")),
+                chunk_id=str(row.get("chunk_id") or row.get("id", "")),
                doc_id=str(row.get("doc_id", "")),
-                doc_name=str(row.get("doc_name", "")),
-                content=str(row.get("content", "")),
+                doc_title=str(row.get("doc_title", "")),
+                text=str(row.get("text", "")),
                score=0.0,
+                chunk_type=str(row.get("chunk_type", "")),
                section_title=str(row.get("section_title", "")),
-                page_number=int(row.get("page_number") or 0),
-                metadata={},
+                page_start=int(row.get("page_start") or 0),
+                page_end=int(row.get("page_end") or 0),
+                section_level=int(row.get("section_level") or 0),
+                chunk_index=int(row.get("chunk_index") or 0),
+                piece_index=int(row.get("piece_index") or 0),
+                metadata=self._parse_metadata_json(row.get("metadata_json", "")),
            )
            for row in rows
-            if row.get("content")
+            if row.get("text")
        ]

+    def _parse_metadata_json(self, raw_metadata: str) -> dict:
+        """Parse the ``metadata_json`` field into a dict for BM25-side filtering.
+
+        Args:
+            raw_metadata: JSON text read from the row's ``metadata_json`` field.
+
+        Returns:
+            The decoded JSON object, or ``{}`` when the value is empty, is not
+            valid JSON, or decodes to anything other than a JSON object.
+        """
+        # Function-scope import: the module's import block is not part of this hunk.
+        import json
+
+        if not raw_metadata:
+            return {}
+        try:
+            parsed = json.loads(raw_metadata)
+        except (TypeError, ValueError, RecursionError):
+            # Best-effort: malformed metadata yields no metadata rather than an error.
+            return {}
+        # Only a JSON object counts as metadata; dict() on a list of pairs or of
+        # two-character strings would silently fabricate keys.
+        return parsed if isinstance(parsed, dict) else {}
+
    def _ensure_built(self) -> None:
        if self._index is not None:
            return
@@ -93,7 +121,7 @@ class BM25Retriever:
            self._chunks = []
            self._index = BM25Okapi([[]])
            return
-        tokenized = [_tokenize(c.content) for c in chunks]
+        tokenized = [_tokenize(c.text) for c in chunks]
        self._chunks = chunks
        self._index = BM25Okapi(tokenized)
        logger.info("BM25Retriever: index built with %d chunks", len(chunks))
@@ -127,20 +155,26 @@ class BM25Retriever:
        for score, chunk in ranked[: top_k * 2]:
            if score <= 0:
                break
-            # Apply simple regulation_type filter if provided
-            if filters and chunk.metadata.get("regulation_type"):
-                types = [t.strip() for t in filters.split(",")]
-                if chunk.metadata.get("regulation_type") not in types:
-                    continue
+            if filters:
+                normalized_filter = filters.replace("doc_name", "doc_title").strip()
+                if normalized_filter.startswith('doc_title == "'):
+                    expected_title = normalized_filter[len('doc_title == "'):-1]
+                    if chunk.doc_title != expected_title:
+                        continue
            results.append(
                RetrievedChunk(
                    chunk_id=chunk.chunk_id,
                    doc_id=chunk.doc_id,
-                    doc_name=chunk.doc_name,
-                    content=chunk.content,
+                    doc_title=chunk.doc_title,
+                    text=chunk.text,
                    score=score,
+                    chunk_type=chunk.chunk_type,
                    section_title=chunk.section_title,
-                    page_number=chunk.page_number,
+                    page_start=chunk.page_start,
+                    page_end=chunk.page_end,
+                    section_level=chunk.section_level,
+                    chunk_index=chunk.chunk_index,
+                    piece_index=chunk.piece_index,
                    metadata=chunk.metadata,
                )
            )