Files
AIRegulation-DocAnalysis/backend/app/infrastructure/parser/local_chunk_builder.py

71 lines
2.7 KiB
Python
Raw Permalink Normal View History

"""Local chunk builder adapter for the migrated backend architecture."""
from __future__ import annotations
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
from app.services.embedding.text_chunker import RegulationChunker
class LocalRegulationChunkBuilder(ChunkBuilder):
    """Chunk builder port implementation backed by ``RegulationChunker``.

    The wrapped chunker splits a parsed document's markdown text; each piece
    it returns is then re-expressed as a domain ``Chunk``.
    """

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
        """Create the wrapped ``RegulationChunker``.

        Args:
            chunk_size: Forwarded unchanged as the chunker's ``chunk_size``.
            chunk_overlap: Forwarded unchanged as the chunker's ``chunk_overlap``.
        """
        self.chunker = RegulationChunker(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Split a parsed document and return the pieces as ``Chunk`` objects.

        Args:
            parsed_document: Document whose ``raw_text`` is chunked; its
                ``doc_id`` and ``doc_name`` are copied onto every chunk.
            regulation_type: Passed to the chunker and stored on every chunk.
            version: Passed to the chunker and stored on every chunk.

        Returns:
            One ``Chunk`` per piece produced by the chunker, in the order the
            chunker yields them; an empty list when the document text is
            empty or whitespace-only.
        """
        source_markdown = parsed_document.raw_text.strip()
        # Nothing to chunk: skip the chunker call entirely.
        if not source_markdown:
            return []

        legacy_items = self.chunker.chunk_document(
            source_markdown,
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            regulation_type=regulation_type,
            version=version,
        )
        return [
            self._to_chunk(legacy, parsed_document, regulation_type, version)
            for legacy in legacy_items
        ]

    @staticmethod
    def _to_chunk(
        legacy,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> Chunk:
        """Translate one chunker result into a domain ``Chunk``."""
        meta = legacy.metadata
        number = meta.section_number
        title = meta.section_title

        extra = {
            "section_number": number,
            "section_title": title,
            "clause_number": meta.clause_number,
            "start_position": meta.start_position,
            "end_position": meta.end_position,
            "token_count": legacy.token_count,
            "source": "local_chunker",
        }
        # Only the non-empty heading parts form the path; its length is
        # reused below as the section level.
        heading_path = [part for part in (number, title) if part]

        return Chunk(
            chunk_id=meta.chunk_id,
            doc_id=parsed_document.doc_id,
            doc_title=parsed_document.doc_name,
            text=legacy.content,
            # The same content is used for display text and embedding text.
            embedding_text=legacy.content,
            chunk_type="local_markdown_chunk",
            # Fall back to the section number when there is no title.
            section_title=title or number,
            section_path=heading_path,
            # A single page number is supplied, so start and end coincide.
            page_start=meta.page_number,
            page_end=meta.page_number,
            section_level=len(heading_path),
            regulation_type=regulation_type,
            version=version,
            semantic_id=meta.clause_number,
            metadata=extra,
        )