Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
ash66
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions

View File

@@ -122,16 +122,17 @@ class FakeChunkBuilder:
Chunk(
chunk_id=f"{parsed_document.doc_id}-chunk-1",
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content="法规正文",
doc_title=parsed_document.doc_name,
text="法规正文",
embedding_text="标准:测试\n章节:第一章\n\n法规正文",
section_title="第一章",
section_path=["第一章"],
page_number=1,
page_start=1,
page_end=1,
chunk_type="section_text",
regulation_type=regulation_type,
version=version,
semantic_id="semantic-1",
block_type="section_text",
metadata={"source": "aliyun_vector_chunk"},
)
]

View File

@@ -18,11 +18,11 @@ class FakeRetriever:
RetrievedChunk(
chunk_id="chunk-1",
doc_id="doc-1",
doc_name="测试法规",
content="法规正文",
doc_title="测试法规",
text="法规正文",
score=0.91,
section_title="第一章",
page_number=1,
page_start=1,
metadata={"section_title": "第一章"},
)
]
@@ -47,12 +47,12 @@ class FakeAnswerGenerator:
sources=[
AnswerSource(
doc_id=item.doc_id,
doc_name=item.doc_name,
doc_title=item.doc_title,
chunk_id=item.chunk_id,
section_title=item.section_title,
page_number=item.page_number,
page_start=item.page_start,
score=item.score,
content=item.content,
text=item.text,
metadata=item.metadata,
)
for item in retrieved_chunks

View File

@@ -0,0 +1,117 @@
"""Test runtime recovery and API error serialization for the Milvus vector index."""
from __future__ import annotations
from fastapi.encoders import jsonable_encoder
from pymilvus import MilvusException
from app.api.models import ErrorResponse
from app.infrastructure.vectorstore.milvus_vector_index import MilvusVectorIndex
from app.shared.errors import VectorStoreSchemaError
class FakeField:
    """Stand-in for a single Milvus schema field that only carries a name."""

    def __init__(self, name: str) -> None:
        """Store the given field name on the instance."""
        self.name = name
class FakeSchema:
    """Stand-in for a Milvus schema object that only exposes ``fields``."""

    def __init__(self, field_names: list[str]) -> None:
        """Wrap each name in a ``FakeField``, keeping the given order."""
        self.fields = list(map(FakeField, field_names))
class FakeCollection:
    """Minimal collection double for the runtime recovery tests.

    It carries a fake schema, two counters (``num_entities`` and
    ``search_calls``) and a queue of canned responses replayed by ``search``.
    """

    def __init__(self, field_names: list[str], responses: list[object]) -> None:
        """Build the fake from schema field names and queued responses.

        Args:
            field_names: Names used to build the fake schema.
            responses: Items consumed by successive ``search`` calls, in
                order. An ``Exception`` instance is raised rather than
                returned.
        """
        self.schema = FakeSchema(field_names)
        # Copy the queue so that consuming it never mutates the caller's list.
        self.responses = list(responses)
        self.num_entities = 0
        self.search_calls = 0

    def search(self, **kwargs):
        """Replay the next queued response.

        Keyword arguments are accepted but not inspected.

        Returns:
            The next queued item when it is not an ``Exception`` instance.

        Raises:
            Exception: The queued exception instance, when the next item is one.
            IndexError: When the call count exceeds the queued responses.
        """
        # Count the call first so failed and exhausted calls are recorded too.
        self.search_calls += 1
        if not self.responses:
            raise IndexError(
                f"FakeCollection.search call #{self.search_calls} has no queued response"
            )
        response = self.responses.pop(0)
        if isinstance(response, Exception):
            raise response
        return response
def _build_index_for_test(*, collection: FakeCollection) -> MilvusVectorIndex:
    """Return a hand-populated MilvusVectorIndex whose ``__init__`` never ran.

    The instance is allocated with ``__new__`` and then given fixed
    connection attributes plus the supplied fake collection.
    """
    attributes = {
        "collection_name": "regulations_dense_1024_v2",
        "db_name": "default",
        "host": "6.86.80.8",
        "port": 19530,
        "alias": "vector-index::test",
        "collection": collection,
    }
    index = MilvusVectorIndex.__new__(MilvusVectorIndex)
    for attribute, value in attributes.items():
        setattr(index, attribute, value)
    return index
def test_search_rebinds_and_retries_after_stale_schema_error(monkeypatch):
    """Check that search rebinds the collection once and retries after a stale-schema error."""
    field_names = [
        "id",
        "doc_id",
        "doc_title",
        "chunk_id",
        "text",
        "embedding",
        "section_title",
        "metadata_json",
    ]
    stale_error = MilvusException(code=65535, message="field doc_title not exist")
    # The first collection fails on its only search; the second returns no hits.
    stale = FakeCollection(field_names, [stale_error])
    refreshed = FakeCollection(field_names, [[]])
    index = _build_index_for_test(collection=stale)

    def rebind(*, force_refresh: bool = False):
        """Hand back the refreshed collection, insisting on a forced refresh."""
        assert force_refresh is True
        return refreshed

    monkeypatch.setattr(index, "_bind_collection", rebind)

    query_vector = [0.0] * 1024
    results = index.search(query_vector, 1)

    assert results == []
    assert stale.search_calls == 1
    assert refreshed.search_calls == 1
    assert index.collection is refreshed
def test_validate_schema_raises_detailed_vector_store_schema_error():
    """Check that schema validation fails with a typed, descriptive error."""
    legacy_fields = ["id", "doc_id", "doc_name", "content", "dense_vector"]
    legacy_collection = FakeCollection(legacy_fields, [[]])
    index = _build_index_for_test(collection=legacy_collection)

    caught = None
    try:
        index._validate_schema(legacy_collection)
    except VectorStoreSchemaError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("VectorStoreSchemaError was not raised")

    # The message must name a missing field and list the fields actually found.
    message = str(caught)
    assert "doc_title" in message
    assert "actual_fields=['id', 'doc_id', 'doc_name', 'content', 'dense_vector']" in message
def test_error_response_is_json_serializable():
    """Check that an ErrorResponse encodes to plain values with a string timestamp."""
    response = ErrorResponse(error="InternalServerError", message="boom")
    encoded = jsonable_encoder(response)

    assert encoded["error"] == "InternalServerError"
    assert encoded["message"] == "boom"
    assert isinstance(encoded["timestamp"], str)

View File

@@ -113,12 +113,12 @@ class FakeAgentConversationService:
sources=[
AnswerSource(
doc_id="doc-api-1",
doc_name="测试法规",
doc_title="测试法规",
chunk_id="chunk-1",
section_title="第一章",
page_number=1,
page_start=1,
score=0.92,
content="法规原文",
text="法规原文",
metadata={"section_title": "第一章"},
)
],
@@ -218,7 +218,6 @@ def test_agent_ask_and_stream_contract_preserved(monkeypatch):
store = FakeConversationStore()
monkeypatch.setattr(agent, "get_agent_conversation_service", lambda: FakeAgentConversationService())
monkeypatch.setattr(agent, "get_conversation_store", lambda: store)
client = TestClient(app)

View File

@@ -65,7 +65,7 @@ def verify_migration_config() -> bool:
try:
assert settings.embedding_model == "text-embedding-v3"
assert settings.embedding_dim == 1024
assert settings.milvus_collection == "regulations_dense_1024_v1"
assert settings.milvus_collection == "regulations_dense_1024_v2"
assert settings.parser_backend == "aliyun"
assert settings.chunk_backend == "aliyun"
logger.info(f"embedding_model={settings.embedding_model}")