Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated the architecture documentation to reflect the new Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with the new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
@@ -122,16 +122,17 @@ class FakeChunkBuilder:
|
||||
Chunk(
|
||||
chunk_id=f"{parsed_document.doc_id}-chunk-1",
|
||||
doc_id=parsed_document.doc_id,
|
||||
doc_name=parsed_document.doc_name,
|
||||
content="法规正文",
|
||||
doc_title=parsed_document.doc_name,
|
||||
text="法规正文",
|
||||
embedding_text="标准:测试\n章节:第一章\n\n法规正文",
|
||||
section_title="第一章",
|
||||
section_path=["第一章"],
|
||||
page_number=1,
|
||||
page_start=1,
|
||||
page_end=1,
|
||||
chunk_type="section_text",
|
||||
regulation_type=regulation_type,
|
||||
version=version,
|
||||
semantic_id="semantic-1",
|
||||
block_type="section_text",
|
||||
metadata={"source": "aliyun_vector_chunk"},
|
||||
)
|
||||
]
|
||||
|
||||
@@ -18,11 +18,11 @@ class FakeRetriever:
|
||||
RetrievedChunk(
|
||||
chunk_id="chunk-1",
|
||||
doc_id="doc-1",
|
||||
doc_name="测试法规",
|
||||
content="法规正文",
|
||||
doc_title="测试法规",
|
||||
text="法规正文",
|
||||
score=0.91,
|
||||
section_title="第一章",
|
||||
page_number=1,
|
||||
page_start=1,
|
||||
metadata={"section_title": "第一章"},
|
||||
)
|
||||
]
|
||||
@@ -47,12 +47,12 @@ class FakeAnswerGenerator:
|
||||
sources=[
|
||||
AnswerSource(
|
||||
doc_id=item.doc_id,
|
||||
doc_name=item.doc_name,
|
||||
doc_title=item.doc_title,
|
||||
chunk_id=item.chunk_id,
|
||||
section_title=item.section_title,
|
||||
page_number=item.page_number,
|
||||
page_start=item.page_start,
|
||||
score=item.score,
|
||||
content=item.content,
|
||||
text=item.text,
|
||||
metadata=item.metadata,
|
||||
)
|
||||
for item in retrieved_chunks
|
||||
|
||||
117
tests/test_milvus_vector_index_runtime.py
Normal file
117
tests/test_milvus_vector_index_runtime.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""Test runtime recovery and API error serialization for the Milvus vector index."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from pymilvus import MilvusException
|
||||
|
||||
from app.api.models import ErrorResponse
|
||||
from app.infrastructure.vectorstore.milvus_vector_index import MilvusVectorIndex
|
||||
from app.shared.errors import VectorStoreSchemaError
|
||||
|
||||
|
||||
class FakeField:
    """Stand-in for a Milvus schema field that carries nothing but a name."""

    def __init__(self, name: str) -> None:
        """Store the field name on the instance."""
        self.name = name
|
||||
|
||||
|
||||
class FakeSchema:
    """Stand-in for a Milvus collection schema that exposes a ``fields`` list."""

    def __init__(self, field_names: list[str]) -> None:
        """Wrap every given name in a ``FakeField``, keeping the input order."""
        self.fields = [FakeField(field_name) for field_name in field_names]
|
||||
|
||||
|
||||
class FakeCollection:
    """Scripted stand-in for a collection object in runtime recovery tests.

    Every ``search`` call consumes the next queued entry: exception instances
    are raised, anything else is returned as the search result.
    """

    def __init__(self, field_names: list[str], responses: list[object]) -> None:
        """Set up the fake schema, the response queue, and the call counters.

        Args:
            field_names: Names exposed through ``schema.fields``.
            responses: Results or exception instances that ``search`` serves
                in order. The list is copied so that draining the queue never
                mutates the list owned by the caller.
        """
        self.schema = FakeSchema(field_names)
        self.responses = list(responses)
        self.num_entities = 0
        self.search_calls = 0

    def search(self, **kwargs: object) -> object:
        """Serve the next queued response, raising it if it is an exception.

        Args:
            **kwargs: Accepted for signature compatibility and ignored.

        Returns:
            The next queued entry when it is not an exception instance.

        Raises:
            IndexError: If ``search`` is called after the queue is exhausted.
            Exception: The queued exception instance, when the next entry is one.
        """
        # Count the attempt first so that failed calls are still recorded.
        self.search_calls += 1
        if not self.responses:
            # Same exception type as popping an empty list, with a message
            # that points at the actual test-setup mistake.
            raise IndexError(
                "FakeCollection.search called more times than responses were queued"
            )
        response = self.responses.pop(0)
        if isinstance(response, Exception):
            raise response
        return response
|
||||
|
||||
|
||||
def _build_index_for_test(*, collection: FakeCollection) -> MilvusVectorIndex:
    """Assemble a MilvusVectorIndex by hand around the given fake collection.

    The instance is allocated with ``__new__`` so ``__init__`` never runs; the
    attributes below are assigned directly instead.
    """
    stub_index = MilvusVectorIndex.__new__(MilvusVectorIndex)
    preset_attributes = {
        "collection_name": "regulations_dense_1024_v2",
        "db_name": "default",
        "host": "6.86.80.8",
        "port": 19530,
        "alias": "vector-index::test",
        "collection": collection,
    }
    for attr_name, attr_value in preset_attributes.items():
        setattr(stub_index, attr_name, attr_value)
    return stub_index
|
||||
|
||||
|
||||
def test_search_rebinds_and_retries_after_stale_schema_error(monkeypatch):
    """Check that a stale-schema failure triggers a forced rebind and a retry.

    The first collection raises a MilvusException on search; the patched
    ``_bind_collection`` hands back a second collection that answers with an
    empty result list.
    """
    schema_fields = [
        "id",
        "doc_id",
        "doc_title",
        "chunk_id",
        "text",
        "embedding",
        "section_title",
        "metadata_json",
    ]
    stale_error = MilvusException(code=65535, message="field doc_title not exist")
    stale_collection = FakeCollection(schema_fields, [stale_error])
    refreshed_collection = FakeCollection(schema_fields, [[]])
    index = _build_index_for_test(collection=stale_collection)

    def rebind_stub(*, force_refresh: bool = False):
        """Hand back the refreshed collection; only forced rebinds are allowed."""
        assert force_refresh is True
        return refreshed_collection

    monkeypatch.setattr(index, "_bind_collection", rebind_stub)

    query_vector = [0.0] * 1024
    results = index.search(query_vector, 1)

    assert results == []
    # Each collection must have been queried exactly once.
    assert stale_collection.search_calls == 1
    assert refreshed_collection.search_calls == 1
    # The index must keep the refreshed collection bound afterwards.
    assert index.collection is refreshed_collection
|
||||
|
||||
|
||||
def test_validate_schema_raises_detailed_vector_store_schema_error():
    """Check that schema validation fails with a VectorStoreSchemaError.

    The error text must mention ``doc_title`` and list the actual field names
    of the rejected collection.
    """
    legacy_fields = ["id", "doc_id", "doc_name", "content", "dense_vector"]
    invalid_collection = FakeCollection(legacy_fields, [[]])
    index = _build_index_for_test(collection=invalid_collection)

    captured = None
    try:
        index._validate_schema(invalid_collection)
    except VectorStoreSchemaError as exc:
        captured = exc

    if captured is None:
        raise AssertionError("VectorStoreSchemaError was not raised")

    error_text = str(captured)
    assert "doc_title" in error_text
    assert "actual_fields=['id', 'doc_id', 'doc_name', 'content', 'dense_vector']" in error_text
|
||||
|
||||
|
||||
def test_error_response_is_json_serializable():
    """Check that an ErrorResponse encodes to plain JSON-compatible values.

    The ``error`` and ``message`` values must round-trip unchanged and the
    ``timestamp`` entry must come out as a string.
    """
    error_response = ErrorResponse(error="InternalServerError", message="boom")
    payload = jsonable_encoder(error_response)

    assert payload["error"] == "InternalServerError"
    assert payload["message"] == "boom"
    assert isinstance(payload["timestamp"], str)
|
||||
@@ -113,12 +113,12 @@ class FakeAgentConversationService:
|
||||
sources=[
|
||||
AnswerSource(
|
||||
doc_id="doc-api-1",
|
||||
doc_name="测试法规",
|
||||
doc_title="测试法规",
|
||||
chunk_id="chunk-1",
|
||||
section_title="第一章",
|
||||
page_number=1,
|
||||
page_start=1,
|
||||
score=0.92,
|
||||
content="法规原文",
|
||||
text="法规原文",
|
||||
metadata={"section_title": "第一章"},
|
||||
)
|
||||
],
|
||||
@@ -218,7 +218,6 @@ def test_agent_ask_and_stream_contract_preserved(monkeypatch):
|
||||
|
||||
store = FakeConversationStore()
|
||||
monkeypatch.setattr(agent, "get_agent_conversation_service", lambda: FakeAgentConversationService())
|
||||
monkeypatch.setattr(agent, "get_conversation_store", lambda: store)
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ def verify_migration_config() -> bool:
|
||||
try:
|
||||
assert settings.embedding_model == "text-embedding-v3"
|
||||
assert settings.embedding_dim == 1024
|
||||
assert settings.milvus_collection == "regulations_dense_1024_v1"
|
||||
assert settings.milvus_collection == "regulations_dense_1024_v2"
|
||||
assert settings.parser_backend == "aliyun"
|
||||
assert settings.chunk_backend == "aliyun"
|
||||
logger.info(f"embedding_model={settings.embedding_model}")
|
||||
|
||||
Reference in New Issue
Block a user