AIRegulation-DocAnalysis/backend/app/infrastructure/parser/vector_chunk_builder.py

"""Implement infrastructure support for vector chunk builder."""

from __future__ import annotations

from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument
# Keep adapter behavior explicit so integration details remain easy to audit.


class AliyunVectorChunkBuilder(ChunkBuilder):
    """Build ``Chunk`` objects from a parsed document's ``vector_chunks``.

    Every entry of ``parsed_document.vector_chunks`` is read with ``.get`` /
    ``.items`` (i.e. it is treated as a mapping) and converted into one
    ``Chunk``. Entries without any usable embedding text are dropped.
    """

    @staticmethod
    def _coerce_page_number(raw: object) -> int:
        """Return *raw* as an ``int``, or ``0`` when it cannot be converted.

        ``0`` is the same value ``build`` uses when an entry carries no page
        information at all, so an unparseable page is reported as "unknown"
        instead of aborting the conversion of the whole document.
        """
        try:
            return int(raw)  # type: ignore[call-overload]
        except (TypeError, ValueError, OverflowError):
            return 0

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Convert the vector chunks of *parsed_document* into ``Chunk`` objects.

        Field resolution per entry:

        * ``content``: ``"content"``, else ``"text"``, else ``""``.
        * ``embedding_text``: ``"embedding_text"``, else the resolved content.
          Entries whose embedding text is empty or whitespace-only are skipped.
        * ``section_title``: ``"section_title"``, else the last element of
          ``"section_path"``, else ``""``.
        * ``page_number``: ``"page_start"``, else ``"page"``, else ``0``;
          values that cannot be converted to ``int`` also become ``0``.
        * ``chunk_id``: ``"chunk_id"``, else ``"<doc_id>-chunk-<index>"`` where
          ``index`` is the entry's position in ``vector_chunks`` (skipped
          entries still consume an index).
        * ``semantic_id`` / ``block_type``: the entry's value, or ``""`` when
          the key is missing or its value is empty/``None``.
        * ``metadata``: every key of the entry except ``"content"`` and
          ``"embedding_text"``.

        Args:
            parsed_document: Source document; its ``vector_chunks``, ``doc_id``
                and ``doc_name`` attributes are read.
            regulation_type: Copied unchanged onto every produced chunk.
            version: Copied unchanged onto every produced chunk.

        Returns:
            The chunks in the same order as the source entries, without the
            skipped ones.
        """
        chunks: list[Chunk] = []
        for index, item in enumerate(parsed_document.vector_chunks):
            content = item.get("content") or item.get("text") or ""
            embedding_text = item.get("embedding_text") or content
            if not embedding_text.strip():
                # Nothing to embed, so the entry would be useless downstream.
                continue

            section_path = item.get("section_path") or []
            section_title = item.get("section_title") or (
                section_path[-1] if section_path else ""
            )
            page_number = self._coerce_page_number(
                item.get("page_start") or item.get("page") or 0
            )
            chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"

            # NOTE(review): a "text" key, when present, stays in the metadata
            # even if it supplied the content - confirm this is intended.
            metadata = {
                k: v for k, v in item.items() if k not in {"content", "embedding_text"}
            }

            chunks.append(
                Chunk(
                    chunk_id=str(chunk_id),
                    doc_id=parsed_document.doc_id,
                    doc_name=parsed_document.doc_name,
                    content=content,
                    embedding_text=embedding_text,
                    section_title=section_title,
                    section_path=section_path,
                    page_number=page_number,
                    regulation_type=regulation_type,
                    version=version,
                    # ``or ""`` (rather than a ``.get`` default) also covers a
                    # key that is present with a ``None`` value, matching how
                    # the other optional fields above are resolved.
                    semantic_id=item.get("semantic_id") or "",
                    block_type=item.get("block_type") or "",
                    metadata=metadata,
                )
            )
        return chunks
Fix SSE route dependency and align architecture docs 2026-05-18 16:32:42 +08:00			`"""Implement infrastructure support for vector chunk builder."""`

			`from __future__ import annotations`

			`from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument`
			`# Keep adapter behavior explicit so integration details remain easy to audit.`



			`class AliyunVectorChunkBuilder(ChunkBuilder):`
			`"""Provide the Aliyun Vector Chunk Builder builder."""`
			`def build(`
			`self,`
			`*,`
			`parsed_document: ParsedDocument,`
			`regulation_type: str,`
			`version: str,`
			`) -> list[Chunk]:`
			`"""Handle build for the Aliyun Vector Chunk Builder instance."""`
			`chunks: list[Chunk] = []`
			`for index, item in enumerate(parsed_document.vector_chunks):`
			`content = item.get("content") or item.get("text") or ""`
			`embedding_text = item.get("embedding_text") or content`
			`if not embedding_text.strip():`
			`continue`
			`section_path = item.get("section_path") or []`
			`section_title = item.get("section_title") or (section_path[-1] if section_path else "")`
			`page_number = item.get("page_start") or item.get("page") or 0`
			`chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"`
			`metadata = {k: v for k, v in item.items() if k not in {"content", "embedding_text"}}`
			`chunks.append(`
			`Chunk(`
			`chunk_id=str(chunk_id),`
			`doc_id=parsed_document.doc_id,`
			`doc_name=parsed_document.doc_name,`
			`content=content,`
			`embedding_text=embedding_text,`
			`section_title=section_title,`
			`section_path=section_path,`
			`page_number=int(page_number or 0),`
			`regulation_type=regulation_type,`
			`version=version,`
			`semantic_id=item.get("semantic_id", ""),`
			`block_type=item.get("block_type", ""),`
			`metadata=metadata,`
			`)`
			`)`
			`return chunks`