"""Implement infrastructure support for the vector chunk builder."""

from __future__ import annotations
from app.domain.documents import Chunk, ChunkBuilder, ParsedDocument

# Keep adapter behavior explicit so integration details remain easy to audit.


class AliyunVectorChunkBuilder(ChunkBuilder):
    """Build ``Chunk`` objects from the vector chunks of a parsed document.

    Every entry of ``parsed_document.vector_chunks`` is read through ``.get``
    and ``.items``, so entries are expected to be mapping-like.
    """

    def build(
        self,
        *,
        parsed_document: ParsedDocument,
        regulation_type: str,
        version: str,
    ) -> list[Chunk]:
        """Convert each usable vector chunk of ``parsed_document`` into a ``Chunk``.

        Args:
            parsed_document: Source document; its ``vector_chunks``, ``doc_id``
                and ``doc_name`` attributes are read.
            regulation_type: Copied unchanged onto every produced chunk.
            version: Copied unchanged onto every produced chunk.

        Returns:
            The chunks in the order of ``parsed_document.vector_chunks``.
            Entries whose embedding text is empty or whitespace-only are
            skipped.
        """
        # Keys that become dedicated Chunk fields and are therefore left out of
        # the metadata copy.
        # NOTE(review): the "text" fallback key is still copied into metadata;
        # confirm whether downstream consumers rely on that.
        excluded_from_metadata = {"content", "embedding_text"}

        chunks: list[Chunk] = []
        for index, item in enumerate(parsed_document.vector_chunks):
            content = item.get("content") or item.get("text") or ""
            embedding_text = item.get("embedding_text") or content
            if not embedding_text.strip():
                # Nothing to embed for this entry.
                continue

            section_path = item.get("section_path") or []
            # Fall back to the last path element when no explicit title exists.
            section_title = item.get("section_title") or (
                section_path[-1] if section_path else ""
            )
            page_number = item.get("page_start") or item.get("page") or 0
            # `index` counts skipped entries too, so generated ids follow the
            # position in `vector_chunks`, not the position in the result.
            chunk_id = item.get("chunk_id") or f"{parsed_document.doc_id}-chunk-{index}"
            metadata = {
                key: value
                for key, value in item.items()
                if key not in excluded_from_metadata
            }

            chunks.append(
                Chunk(
                    chunk_id=str(chunk_id),
                    doc_id=parsed_document.doc_id,
                    doc_name=parsed_document.doc_name,
                    content=content,
                    embedding_text=embedding_text,
                    section_title=section_title,
                    section_path=section_path,
                    page_number=int(page_number),
                    regulation_type=regulation_type,
                    version=version,
                    # `or ""` (not a `.get` default) so that an explicit None
                    # value is normalised the same way as the fields above.
                    semantic_id=item.get("semantic_id") or "",
                    block_type=item.get("block_type") or "",
                    metadata=metadata,
                )
            )
        return chunks
|