Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the `sqlalchemy` dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
ash66
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions

View File

@@ -56,7 +56,21 @@ class BM25Retriever:
try:
rows = self._vector_index.collection.query(
expr='doc_id != ""',
output_fields=["id", "doc_id", "doc_name", "content", "section_title", "page_number"],
output_fields=[
"id",
"chunk_id",
"doc_id",
"doc_title",
"text",
"chunk_type",
"section_title",
"page_start",
"page_end",
"section_level",
"chunk_index",
"piece_index",
"metadata_json",
],
limit=16384,
)
except Exception:
@@ -64,19 +78,33 @@ class BM25Retriever:
return []
return [
RetrievedChunk(
chunk_id=str(row.get("id", "")),
chunk_id=str(row.get("chunk_id") or row.get("id", "")),
doc_id=str(row.get("doc_id", "")),
doc_name=str(row.get("doc_name", "")),
content=str(row.get("content", "")),
doc_title=str(row.get("doc_title", "")),
text=str(row.get("text", "")),
score=0.0,
chunk_type=str(row.get("chunk_type", "")),
section_title=str(row.get("section_title", "")),
page_number=int(row.get("page_number") or 0),
metadata={},
page_start=int(row.get("page_start") or 0),
page_end=int(row.get("page_end") or 0),
section_level=int(row.get("section_level") or 0),
chunk_index=int(row.get("chunk_index") or 0),
piece_index=int(row.get("piece_index") or 0),
metadata=self._parse_metadata_json(row.get("metadata_json", "")),
)
for row in rows
if row.get("content")
if row.get("text")
]
def _parse_metadata_json(self, raw_metadata: str) -> dict:
    """Parse a ``metadata_json`` value into a dict for BM25-side filtering.

    Args:
        raw_metadata: JSON text taken from a row's ``metadata_json`` field.
            A value that is already a dict is copied and returned as is.

    Returns:
        The decoded JSON object, or ``{}`` when the value is empty, is not
        valid JSON, or does not decode to a JSON object.
    """
    import json  # function-scope import keeps this method self-contained

    if not raw_metadata:
        return {}
    # NOTE(review): presumably the field is stored as a JSON string; a dict
    # is accepted too so already-decoded metadata is not silently discarded.
    if isinstance(raw_metadata, dict):
        return dict(raw_metadata)
    try:
        parsed = json.loads(raw_metadata)
    except (TypeError, ValueError, RecursionError):
        # Best effort: malformed metadata must not break chunk loading.
        return {}
    # Only a JSON object is meaningful as metadata; arrays and scalars are
    # rejected instead of being coerced through dict().
    return dict(parsed) if isinstance(parsed, dict) else {}
def _ensure_built(self) -> None:
if self._index is not None:
return
@@ -93,7 +121,7 @@ class BM25Retriever:
self._chunks = []
self._index = BM25Okapi([[]])
return
tokenized = [_tokenize(c.content) for c in chunks]
tokenized = [_tokenize(c.text) for c in chunks]
self._chunks = chunks
self._index = BM25Okapi(tokenized)
logger.info("BM25Retriever: index built with %d chunks", len(chunks))
@@ -127,20 +155,26 @@ class BM25Retriever:
for score, chunk in ranked[: top_k * 2]:
if score <= 0:
break
# Apply simple regulation_type filter if provided
if filters and chunk.metadata.get("regulation_type"):
types = [t.strip() for t in filters.split(",")]
if chunk.metadata.get("regulation_type") not in types:
continue
if filters:
normalized_filter = filters.replace("doc_name", "doc_title").strip()
if normalized_filter.startswith('doc_title == "'):
expected_title = normalized_filter[len('doc_title == "'):-1]
if chunk.doc_title != expected_title:
continue
results.append(
RetrievedChunk(
chunk_id=chunk.chunk_id,
doc_id=chunk.doc_id,
doc_name=chunk.doc_name,
content=chunk.content,
doc_title=chunk.doc_title,
text=chunk.text,
score=score,
chunk_type=chunk.chunk_type,
section_title=chunk.section_title,
page_number=chunk.page_number,
page_start=chunk.page_start,
page_end=chunk.page_end,
section_level=chunk.section_level,
chunk_index=chunk.chunk_index,
piece_index=chunk.piece_index,
metadata=chunk.metadata,
)
)