# AIRegulation-DocAnalysis/backend/app/infrastructure/parser/vector_chunk_builder.py

"""Implement infrastructure support for vector chunk builder."""

from __future__ import annotations

from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.


class AliyunVectorChunkBuilder(ChunkBuilder):
    """Build ``Chunk`` objects from ``parsed_document.vector_chunks``.

    Each entry of ``vector_chunks`` is read through ``.get(...)`` as a
    mapping.  Keys that are missing, ``None`` or otherwise falsy are replaced
    by a per-field default, so a key that is present with a ``None`` value is
    treated the same way as an absent key.
    """

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Convert every non-blank vector chunk entry into a ``Chunk``.

        Args:
            parsed_document: Source document; its ``vector_chunks``,
                ``doc_id`` and ``doc_name`` attributes are read.
            regulation_type: Value copied onto every chunk and into its
                metadata under ``"regulation_type"``.
            version: Value copied onto every chunk and into its metadata
                under ``"version"``.

        Returns:
            One ``Chunk`` per entry whose embedding text is not blank, in the
            order of ``parsed_document.vector_chunks``.
        """
        chunks: list[Chunk] = []
        for index, item in enumerate(parsed_document.vector_chunks):
            text = item.get("text") or ""
            # The embedding text falls back to the plain text; entries with
            # nothing to embed are skipped entirely.
            embedding_text = item.get("embedding_text") or text
            if not embedding_text.strip():
                continue

            section_path = item.get("section_path") or []
            # Without an explicit title, use the deepest section path element.
            section_title = item.get("section_title") or (
                section_path[-1] if section_path else ""
            )
            # The fallback id uses the enumeration index, which also counts
            # skipped entries.
            chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"

            # ``or`` (not a ``get`` default) so that an explicit ``None`` does
            # not become the string "None" and an empty ``chunk_type`` still
            # falls back to ``block_type``.
            chunk_type = item.get("chunk_type") or item.get("block_type") or ""
            # ``or []`` guards against an explicit ``None``, which a ``get``
            # default would pass through and which is not iterable.
            source_ids = [str(value) for value in (item.get("source_ids") or [])]
            semantic_id = item.get("semantic_id") or ""

            # Shallow copy of the raw entry, extended with the call arguments.
            metadata = dict(item)
            metadata["regulation_type"] = regulation_type
            metadata["version"] = version

            chunks.append(
                Chunk(
                    chunk_id=str(chunk_id),
                    doc_id=parsed_document.doc_id,
                    doc_title=str(item.get("doc_title") or parsed_document.doc_name),
                    text=text,
                    embedding_text=embedding_text,
                    chunk_type=str(chunk_type),
                    # NOTE(review): a missing chunk_index falls back to 0, not
                    # to the enumeration index - confirm this is intended.
                    chunk_index=int(item.get("chunk_index") or 0),
                    piece_index=int(item.get("piece_index") or 0),
                    page_start=int(item.get("page_start") or 0),
                    page_end=int(item.get("page_end") or 0),
                    section_title=section_title,
                    section_path=section_path,
                    # Depth defaults to the number of section path elements.
                    section_level=int(item.get("section_level") or len(section_path)),
                    source_ids=source_ids,
                    regulation_type=regulation_type,
                    version=version,
                    semantic_id=semantic_id,
                    metadata=metadata,
                )
            )
        return chunks