Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the `sqlalchemy` dependency.
- Modified test cases to align with the new document structure and naming conventions (`doc_name` → `doc_title`, `content` → `text`, `page_number` → `page_start`/`page_end`, `block_type` → `chunk_type`).
- Introduced a new test file (`tests/test_milvus_vector_index_runtime.py`) for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions
--- a/tests/test_embedding.py
+++ b/tests/test_embedding.py
@@ -122,16 +122,17 @@ class FakeChunkBuilder:
            Chunk(
                chunk_id=f"{parsed_document.doc_id}-chunk-1",
                doc_id=parsed_document.doc_id,
-                doc_name=parsed_document.doc_name,
-                content="法规正文",
+                doc_title=parsed_document.doc_name,
+                text="法规正文",
                embedding_text="标准：测试\n章节：第一章\n\n法规正文",
                section_title="第一章",
                section_path=["第一章"],
-                page_number=1,
+                page_start=1,
+                page_end=1,
+                chunk_type="section_text",
                regulation_type=regulation_type,
                version=version,
                semantic_id="semantic-1",
-                block_type="section_text",
                metadata={"source": "aliyun_vector_chunk"},
            )
        ]
--- a/tests/test_milvus.py
+++ b/tests/test_milvus.py
@@ -18,11 +18,11 @@ class FakeRetriever:
            RetrievedChunk(
                chunk_id="chunk-1",
                doc_id="doc-1",
-                doc_name="测试法规",
-                content="法规正文",
+                doc_title="测试法规",
+                text="法规正文",
                score=0.91,
                section_title="第一章",
-                page_number=1,
+                page_start=1,
                metadata={"section_title": "第一章"},
            )
        ]
@@ -47,12 +47,12 @@ class FakeAnswerGenerator:
            sources=[
                AnswerSource(
                    doc_id=item.doc_id,
-                    doc_name=item.doc_name,
+                    doc_title=item.doc_title,
                    chunk_id=item.chunk_id,
                    section_title=item.section_title,
-                    page_number=item.page_number,
+                    page_start=item.page_start,
                    score=item.score,
-                    content=item.content,
+                    text=item.text,
                    metadata=item.metadata,
                )
                for item in retrieved_chunks
--- a/tests/test_milvus_vector_index_runtime.py
+++ b/tests/test_milvus_vector_index_runtime.py
@@ -0,0 +1,117 @@
+"""Test runtime recovery and API error serialization for the Milvus vector index."""
+
+from __future__ import annotations
+
+from fastapi.encoders import jsonable_encoder
+from pymilvus import MilvusException
+
+from app.api.models import ErrorResponse
+from app.infrastructure.vectorstore.milvus_vector_index import MilvusVectorIndex
+from app.shared.errors import VectorStoreSchemaError
+
+
+class FakeField:
+    """Represent a minimal Milvus schema field for tests."""
+
+    def __init__(self, name: str) -> None:
+        """Initialize the fake field."""
+        self.name = name
+
+
+class FakeSchema:
+    """Represent a minimal Milvus schema container for tests."""
+
+    def __init__(self, field_names: list[str]) -> None:
+        """Initialize the fake schema from field names."""
+        self.fields = [FakeField(name) for name in field_names]
+
+
+class FakeCollection:
+    """Represent a minimal collection object for runtime recovery tests."""
+
+    def __init__(self, field_names: list[str], responses: list[object]) -> None:
+        """Initialize the fake collection with schema fields and queued responses."""
+        self.schema = FakeSchema(field_names)
+        self.responses = responses
+        self.num_entities = 0
+        self.search_calls = 0
+
+    def search(self, **kwargs):
+        """Return the next queued response or raise the next queued exception."""
+        self.search_calls += 1
+        response = self.responses.pop(0)
+        if isinstance(response, Exception):
+            raise response
+        return response
+
+
+def _build_index_for_test(*, collection: FakeCollection) -> MilvusVectorIndex:
+    """Create a MilvusVectorIndex instance without opening a real Milvus connection."""
+    index = MilvusVectorIndex.__new__(MilvusVectorIndex)
+    index.collection_name = "regulations_dense_1024_v2"
+    index.db_name = "default"
+    index.host = "6.86.80.8"
+    index.port = 19530
+    index.alias = "vector-index::test"
+    index.collection = collection
+    return index
+
+
+def test_search_rebinds_and_retries_after_stale_schema_error(monkeypatch):
+    """Refresh the bound collection once when Milvus reports a stale schema field."""
+    schema_fields = [
+        "id",
+        "doc_id",
+        "doc_title",
+        "chunk_id",
+        "text",
+        "embedding",
+        "section_title",
+        "metadata_json",
+    ]
+    stale_collection = FakeCollection(
+        schema_fields,
+        [MilvusException(code=65535, message="field doc_title not exist")],
+    )
+    refreshed_collection = FakeCollection(schema_fields, [[]])
+    index = _build_index_for_test(collection=stale_collection)
+
+    def fake_bind_collection(*, force_refresh: bool = False):
+        """Return the refreshed collection on forced rebinding."""
+        assert force_refresh is True
+        return refreshed_collection
+
+    monkeypatch.setattr(index, "_bind_collection", fake_bind_collection)
+
+    results = index.search([0.0] * 1024, 1)
+
+    assert results == []
+    assert stale_collection.search_calls == 1
+    assert refreshed_collection.search_calls == 1
+    assert index.collection is refreshed_collection
+
+
+def test_validate_schema_raises_detailed_vector_store_schema_error():
+    """Raise a typed schema error when required Milvus fields are missing."""
+    invalid_collection = FakeCollection(
+        ["id", "doc_id", "doc_name", "content", "dense_vector"],
+        [[]],
+    )
+    index = _build_index_for_test(collection=invalid_collection)
+
+    try:
+        index._validate_schema(invalid_collection)
+    except VectorStoreSchemaError as exc:
+        assert "doc_title" in str(exc)
+        assert "actual_fields=['id', 'doc_id', 'doc_name', 'content', 'dense_vector']" in str(exc)
+    else:
+        raise AssertionError("VectorStoreSchemaError was not raised")
+
+
+def test_error_response_is_json_serializable():
+    """Ensure shared API error responses encode datetime fields safely."""
+    payload = jsonable_encoder(ErrorResponse(error="InternalServerError", message="boom"))
+
+    assert payload["error"] == "InternalServerError"
+    assert payload["message"] == "boom"
+    assert isinstance(payload["timestamp"], str)
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -113,12 +113,12 @@ class FakeAgentConversationService:
            sources=[
                AnswerSource(
                    doc_id="doc-api-1",
-                    doc_name="测试法规",
+                    doc_title="测试法规",
                    chunk_id="chunk-1",
                    section_title="第一章",
-                    page_number=1,
+                    page_start=1,
                    score=0.92,
-                    content="法规原文",
+                    text="法规原文",
                    metadata={"section_title": "第一章"},
                )
            ],
@@ -218,7 +218,6 @@ def test_agent_ask_and_stream_contract_preserved(monkeypatch):

    store = FakeConversationStore()
    monkeypatch.setattr(agent, "get_agent_conversation_service", lambda: FakeAgentConversationService())
-    monkeypatch.setattr(agent, "get_conversation_store", lambda: store)

    client = TestClient(app)

--- a/tests/verify_mvp.py
+++ b/tests/verify_mvp.py
@@ -65,7 +65,7 @@ def verify_migration_config() -> bool:
    try:
        assert settings.embedding_model == "text-embedding-v3"
        assert settings.embedding_dim == 1024
-        assert settings.milvus_collection == "regulations_dense_1024_v1"
+        assert settings.milvus_collection == "regulations_dense_1024_v2"
        assert settings.parser_backend == "aliyun"
        assert settings.chunk_backend == "aliyun"
        logger.info(f"embedding_model={settings.embedding_model}")