Refactor document handling and update Milvus collection settings

- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated architecture documentation to reflect changes in the Milvus collection name.
- Adjusted requirements by removing the `sqlalchemy` dependency.
- Modified test cases to align with the new document structure and naming conventions.
- Introduced a new test file for Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
ash66
2026-05-26 20:21:31 +08:00
parent fec22a3a2c
commit 30c7bda389
42 changed files with 7482 additions and 569 deletions

View File

@@ -10,6 +10,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
"""Adapt the existing markdown chunker to the new chunk builder port."""
def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
"""Initialize the local markdown chunk builder."""
self.chunker = RegulationChunker(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
@@ -22,6 +23,7 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
regulation_type: str,
version: str,
) -> list[Chunk]:
"""Build migrated chunk objects from the legacy markdown chunker output."""
markdown_text = parsed_document.raw_text.strip()
if not markdown_text:
return []
@@ -50,16 +52,18 @@ class LocalRegulationChunkBuilder(ChunkBuilder):
Chunk(
chunk_id=item.metadata.chunk_id,
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content=item.content,
doc_title=parsed_document.doc_name,
text=item.content,
embedding_text=item.content,
chunk_type="local_markdown_chunk",
section_title=item.metadata.section_title or item.metadata.section_number,
section_path=section_path,
page_number=item.metadata.page_number,
page_start=item.metadata.page_number,
page_end=item.metadata.page_number,
section_level=len(section_path),
regulation_type=regulation_type,
version=version,
semantic_id=item.metadata.clause_number,
block_type="local_markdown_chunk",
metadata=metadata,
)
)