"""Implement infrastructure support for vector chunk builder."""

from __future__ import annotations

from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument


# Keep adapter behavior explicit so integration details remain easy to audit.
class AliyunVectorChunkBuilder(ChunkBuilder):
    """Build ``Chunk`` objects from the ``vector_chunks`` of a parsed document."""

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Convert each entry of ``parsed_document.vector_chunks`` into a ``Chunk``.

        Entries whose embedding text (``embedding_text``, falling back to
        ``text``) is empty or whitespace-only are skipped.  Missing or null
        fields fall back to neutral defaults (empty string, ``0``, empty list)
        so that a sparse entry does not abort the whole build.

        Args:
            parsed_document: Source document; its ``vector_chunks`` entries are
                read with ``.get`` and copied with ``dict(...)``, so they are
                expected to be mappings.
            regulation_type: Value stamped on every chunk and its metadata.
            version: Value stamped on every chunk and its metadata.

        Returns:
            The chunks that were built, in the order of the source entries.
        """
        chunks: list[Chunk] = []
        for index, item in enumerate(parsed_document.vector_chunks):
            text = item.get("text") or ""
            embedding_text = item.get("embedding_text") or text
            if not embedding_text.strip():
                # Nothing to embed for this entry.
                continue

            section_path = item.get("section_path") or []
            # The innermost path element doubles as the title when none is given.
            section_title = item.get("section_title") or (
                section_path[-1] if section_path else ""
            )
            # The positional fallback id is only unique within this document.
            chunk_id = (
                item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
            )
            # ``or`` fallbacks (not ``.get`` defaults) so that an explicit null
            # is treated like a missing key instead of becoming the string
            # "None" or raising when iterated.
            chunk_type = item.get("chunk_type") or item.get("block_type") or ""
            source_ids = item.get("source_ids") or []
            semantic_id = item.get("semantic_id") or ""

            # Shallow copy: the raw entry is kept as metadata, with the
            # caller-supplied classification overriding any same-named keys.
            metadata = dict(item)
            metadata["regulation_type"] = regulation_type
            metadata["version"] = version

            chunks.append(
                Chunk(
                    chunk_id=str(chunk_id),
                    doc_id=parsed_document.doc_id,
                    doc_title=str(item.get("doc_title") or parsed_document.doc_name),
                    text=text,
                    embedding_text=embedding_text,
                    chunk_type=str(chunk_type),
                    chunk_index=int(item.get("chunk_index") or 0),
                    piece_index=int(item.get("piece_index") or 0),
                    page_start=int(item.get("page_start") or 0),
                    page_end=int(item.get("page_end") or 0),
                    section_title=section_title,
                    section_path=section_path,
                    # A missing (or zero) level is derived from the path depth.
                    section_level=int(
                        item.get("section_level") or len(section_path)
                    ),
                    source_ids=[str(value) for value in source_ids],
                    regulation_type=regulation_type,
                    version=version,
                    semantic_id=semantic_id,
                    metadata=metadata,
                )
            )
        return chunks