- Removed multiple failed document entries from `documents.json`. - Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`. - Updated architecture documentation to reflect changes in the Milvus collection name. - Adjusted requirements by removing the sqlalchemy dependency. - Modified test cases to align with new document structure and naming conventions. - Introduced a new test file for Milvus vector index runtime recovery and error handling. - Updated assertions in various test files to ensure compatibility with the new schema.
71 lines
2.7 KiB
Python
71 lines
2.7 KiB
Python
"""Local chunk builder adapter for the migrated backend architecture."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
|
|
from app.services.embedding.text_chunker import RegulationChunker
|
|
|
|
|
|
class LocalRegulationChunkBuilder(ChunkBuilder):
    """Chunk builder port implementation backed by ``RegulationChunker``.

    The wrapped chunker does the actual splitting; this class only forwards
    the document text to it and repackages each result as a ``Chunk``.
    """

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
        """Create the wrapped chunker.

        Args:
            chunk_size: Forwarded unchanged to ``RegulationChunker``.
            chunk_overlap: Forwarded unchanged to ``RegulationChunker``.
        """
        self.chunker = RegulationChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Split a parsed document and return the result as ``Chunk`` objects.

        Args:
            parsed_document: Supplies ``raw_text``, ``doc_id`` and ``doc_name``.
            regulation_type: Passed to the chunker and copied onto every chunk.
            version: Passed to the chunker and copied onto every chunk.

        Returns:
            One ``Chunk`` per item produced by the wrapped chunker, in the
            order the chunker yields them. An empty list when the document's
            raw text is empty after stripping surrounding whitespace.
        """
        source_text = parsed_document.raw_text.strip()
        if not source_text:
            # Nothing to split; the chunker is not invoked at all.
            return []

        legacy_items = self.chunker.chunk_document(
            source_text,
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            regulation_type=regulation_type,
            version=version,
        )
        return [
            self._to_chunk(legacy, parsed_document, regulation_type, version)
            for legacy in legacy_items
        ]

    def _to_chunk(
        self,
        legacy,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> Chunk:
        """Repackage a single chunker result as a ``Chunk``."""
        info = legacy.metadata
        extra = {
            "section_number": info.section_number,
            "section_title": info.section_title,
            "clause_number": info.clause_number,
            "start_position": info.start_position,
            "end_position": info.end_position,
            "token_count": legacy.token_count,
            "source": "local_chunker",
        }
        # Only truthy parts are kept, so the path has 0, 1 or 2 entries and
        # its length doubles as the section level.
        path = [part for part in (info.section_number, info.section_title) if part]
        return Chunk(
            chunk_id=info.chunk_id,
            doc_id=parsed_document.doc_id,
            doc_title=parsed_document.doc_name,
            text=legacy.content,
            # The embedding input is the chunk content itself, unmodified.
            embedding_text=legacy.content,
            chunk_type="local_markdown_chunk",
            # Fall back to the section number when the title is falsy.
            section_title=info.section_title or info.section_number,
            section_path=path,
            # The same page number is used for both ends of the page range.
            page_start=info.page_number,
            page_end=info.page_number,
            section_level=len(path),
            regulation_type=regulation_type,
            version=version,
            semantic_id=info.clause_number,
            metadata=extra,
        )
|