Fix SSE route dependency and align architecture docs
This commit is contained in:
66
backend/app/infrastructure/parser/local_chunk_builder.py
Normal file
66
backend/app/infrastructure/parser/local_chunk_builder.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Local chunk builder adapter for the migrated backend architecture."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
|
||||
from app.services.embedding.text_chunker import RegulationChunker
|
||||
|
||||
|
||||
class LocalRegulationChunkBuilder(ChunkBuilder):
    """Chunk builder port implementation that delegates to ``RegulationChunker``."""

    def __init__(self, *, chunk_size: int = 512, chunk_overlap: int = 50) -> None:
        """Create the wrapped chunker.

        Args:
            chunk_size: Forwarded unchanged to ``RegulationChunker``.
            chunk_overlap: Forwarded unchanged to ``RegulationChunker``.
        """
        self.chunker = RegulationChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Chunk the document's raw text and convert each result to a ``Chunk``.

        Args:
            parsed_document: Source of ``raw_text``, ``doc_id`` and ``doc_name``.
            regulation_type: Passed to the chunker and stamped on every chunk.
            version: Passed to the chunker and stamped on every chunk.

        Returns:
            One ``Chunk`` per item produced by the wrapped chunker, in the
            same order; an empty list when the stripped raw text is empty.
        """
        stripped_text = parsed_document.raw_text.strip()
        if not stripped_text:
            # Nothing to chunk: skip the chunker call entirely.
            return []

        produced = self.chunker.chunk_document(
            stripped_text,
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            regulation_type=regulation_type,
            version=version,
        )
        return [
            self._to_domain_chunk(
                legacy,
                parsed_document=parsed_document,
                regulation_type=regulation_type,
                version=version,
            )
            for legacy in produced
        ]

    @staticmethod
    def _to_domain_chunk(
        legacy,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> Chunk:
        """Map one chunker result onto the domain ``Chunk`` model."""
        meta = legacy.metadata
        number = meta.section_number
        title = meta.section_title

        extra = {
            "section_number": number,
            "section_title": title,
            "clause_number": meta.clause_number,
            "start_position": meta.start_position,
            "end_position": meta.end_position,
            "token_count": legacy.token_count,
            "source": "local_chunker",
        }

        return Chunk(
            chunk_id=meta.chunk_id,
            # Document identity comes from the parsed document, not the chunker output.
            doc_id=parsed_document.doc_id,
            doc_name=parsed_document.doc_name,
            content=legacy.content,
            embedding_text=legacy.content,
            # Fall back to the section number when the title is falsy.
            section_title=title or number,
            # Keep only the truthy parts, number first.
            section_path=list(filter(None, (number, title))),
            page_number=meta.page_number,
            regulation_type=regulation_type,
            version=version,
            semantic_id=meta.clause_number,
            block_type="local_markdown_chunk",
            metadata=extra,
        )
|
||||
Reference in New Issue
Block a user