Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the sqlalchemy dependency.
- Modified test cases to align with new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
@@ -56,7 +56,21 @@ class BM25Retriever:
|
||||
try:
|
||||
rows = self._vector_index.collection.query(
|
||||
expr='doc_id != ""',
|
||||
output_fields=["id", "doc_id", "doc_name", "content", "section_title", "page_number"],
|
||||
output_fields=[
|
||||
"id",
|
||||
"chunk_id",
|
||||
"doc_id",
|
||||
"doc_title",
|
||||
"text",
|
||||
"chunk_type",
|
||||
"section_title",
|
||||
"page_start",
|
||||
"page_end",
|
||||
"section_level",
|
||||
"chunk_index",
|
||||
"piece_index",
|
||||
"metadata_json",
|
||||
],
|
||||
limit=16384,
|
||||
)
|
||||
except Exception:
|
||||
@@ -64,19 +78,33 @@ class BM25Retriever:
|
||||
return []
|
||||
return [
|
||||
RetrievedChunk(
|
||||
chunk_id=str(row.get("id", "")),
|
||||
chunk_id=str(row.get("chunk_id") or row.get("id", "")),
|
||||
doc_id=str(row.get("doc_id", "")),
|
||||
doc_name=str(row.get("doc_name", "")),
|
||||
content=str(row.get("content", "")),
|
||||
doc_title=str(row.get("doc_title", "")),
|
||||
text=str(row.get("text", "")),
|
||||
score=0.0,
|
||||
chunk_type=str(row.get("chunk_type", "")),
|
||||
section_title=str(row.get("section_title", "")),
|
||||
page_number=int(row.get("page_number") or 0),
|
||||
metadata={},
|
||||
page_start=int(row.get("page_start") or 0),
|
||||
page_end=int(row.get("page_end") or 0),
|
||||
section_level=int(row.get("section_level") or 0),
|
||||
chunk_index=int(row.get("chunk_index") or 0),
|
||||
piece_index=int(row.get("piece_index") or 0),
|
||||
metadata=self._parse_metadata_json(row.get("metadata_json", "")),
|
||||
)
|
||||
for row in rows
|
||||
if row.get("content")
|
||||
if row.get("text")
|
||||
]
|
||||
|
||||
def _parse_metadata_json(self, raw_metadata: str) -> dict:
|
||||
"""Parse metadata_json into a dict for BM25-side filtering."""
|
||||
if not raw_metadata:
|
||||
return {}
|
||||
try:
|
||||
return dict(__import__("json").loads(raw_metadata))
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def _ensure_built(self) -> None:
|
||||
if self._index is not None:
|
||||
return
|
||||
@@ -93,7 +121,7 @@ class BM25Retriever:
|
||||
self._chunks = []
|
||||
self._index = BM25Okapi([[]])
|
||||
return
|
||||
tokenized = [_tokenize(c.content) for c in chunks]
|
||||
tokenized = [_tokenize(c.text) for c in chunks]
|
||||
self._chunks = chunks
|
||||
self._index = BM25Okapi(tokenized)
|
||||
logger.info("BM25Retriever: index built with %d chunks", len(chunks))
|
||||
@@ -127,20 +155,26 @@ class BM25Retriever:
|
||||
for score, chunk in ranked[: top_k * 2]:
|
||||
if score <= 0:
|
||||
break
|
||||
# Apply simple regulation_type filter if provided
|
||||
if filters and chunk.metadata.get("regulation_type"):
|
||||
types = [t.strip() for t in filters.split(",")]
|
||||
if chunk.metadata.get("regulation_type") not in types:
|
||||
continue
|
||||
if filters:
|
||||
normalized_filter = filters.replace("doc_name", "doc_title").strip()
|
||||
if normalized_filter.startswith('doc_title == "'):
|
||||
expected_title = normalized_filter[len('doc_title == "'):-1]
|
||||
if chunk.doc_title != expected_title:
|
||||
continue
|
||||
results.append(
|
||||
RetrievedChunk(
|
||||
chunk_id=chunk.chunk_id,
|
||||
doc_id=chunk.doc_id,
|
||||
doc_name=chunk.doc_name,
|
||||
content=chunk.content,
|
||||
doc_title=chunk.doc_title,
|
||||
text=chunk.text,
|
||||
score=score,
|
||||
chunk_type=chunk.chunk_type,
|
||||
section_title=chunk.section_title,
|
||||
page_number=chunk.page_number,
|
||||
page_start=chunk.page_start,
|
||||
page_end=chunk.page_end,
|
||||
section_level=chunk.section_level,
|
||||
chunk_index=chunk.chunk_index,
|
||||
piece_index=chunk.piece_index,
|
||||
metadata=chunk.metadata,
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user