"""Local chunk builder adapter for the migrated backend architecture."""

from __future__ import annotations

from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
from app.services.embedding.text_chunker import RegulationChunker


class LocalRegulationChunkBuilder(ChunkBuilder):
    """Chunk builder port implementation backed by ``RegulationChunker``.

    The wrapped chunker does the actual splitting; this class only
    translates its output items into ``Chunk`` objects.
    """

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
        """Create the wrapped ``RegulationChunker``.

        Args:
            chunk_size: Forwarded unchanged to ``RegulationChunker``.
            chunk_overlap: Forwarded unchanged to ``RegulationChunker``.
        """
        self.chunker = RegulationChunker(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Chunk the document's raw text and convert every piece to a ``Chunk``.

        Args:
            parsed_document: Document supplying ``raw_text``, ``doc_id`` and
                ``doc_name``.
            regulation_type: Passed to the chunker and copied onto each chunk.
            version: Passed to the chunker and copied onto each chunk.

        Returns:
            One ``Chunk`` per item returned by the chunker, in the same
            order; an empty list when the stripped raw text is empty.
        """
        source_text = parsed_document.raw_text.strip()
        if not source_text:
            # Nothing to split, so the chunker is never invoked.
            return []

        pieces = self.chunker.chunk_document(
            source_text,
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            regulation_type=regulation_type,
            version=version,
        )
        return [
            self._convert(
                piece,
                parsed_document=parsed_document,
                regulation_type=regulation_type,
                version=version,
            )
            for piece in pieces
        ]

    @staticmethod
    def _convert(
        piece,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> Chunk:
        """Translate a single chunker output item into a ``Chunk``."""
        info = piece.metadata
        extra = {
            "section_number": info.section_number,
            "section_title": info.section_title,
            "clause_number": info.clause_number,
            "start_position": info.start_position,
            "end_position": info.end_position,
            "token_count": piece.token_count,
            "source": "local_chunker",
        }
        # Only the truthy parts of (number, title) make up the path, so its
        # length (0, 1 or 2) is also what gets stored as the section level.
        path = list(filter(None, (info.section_number, info.section_title)))
        return Chunk(
            chunk_id=info.chunk_id,
            doc_id=parsed_document.doc_id,
            doc_title=parsed_document.doc_name,
            text=piece.content,
            embedding_text=piece.content,
            chunk_type="local_markdown_chunk",
            # The section number stands in when the title is falsy.
            section_title=info.section_title or info.section_number,
            section_path=path,
            page_start=info.page_number,
            page_end=info.page_number,
            section_level=len(path),
            regulation_type=regulation_type,
            version=version,
            semantic_id=info.clause_number,
            metadata=extra,
        )