"""Provide utility helpers for chunking.""" import re from typing import List from app.core.config import settings # Keep module behavior explicit so the backend flow stays easy to audit. class TextChunker: """Represent the Text Chunker type.""" def __init__( self, chunk_size: int = settings.chunk_size, chunk_overlap: int = settings.chunk_overlap, ): """Initialize the Text Chunker instance.""" self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap def chunk_by_clause(self, text: str) -> List[dict]: """Handle chunk by clause for the Text Chunker instance.""" clause_pattern = r"(第[一二三四五六七八九十百]+条)" parts = re.split(clause_pattern, text) chunks = [] current_clause = None current_text = "" chunk_index = 0 for part in parts: if re.match(clause_pattern, part): if current_clause and current_text.strip(): chunks.append({ "clause_id": current_clause, "content": current_text.strip(), "chunk_index": chunk_index, }) chunk_index += 1 current_clause = part current_text = "" else: current_text += part if current_clause and current_text.strip(): chunks.append({ "clause_id": current_clause, "content": current_text.strip(), "chunk_index": chunk_index, }) return chunks def chunk_by_size(self, text: str) -> List[dict]: """Handle chunk by size for the Text Chunker instance.""" chunks = [] start = 0 chunk_index = 0 while start < len(text): end = start + self.chunk_size chunk_text = text[start:end] if chunk_text.strip(): chunks.append({ "content": chunk_text.strip(), "chunk_index": chunk_index, "start_pos": start, "end_pos": end, }) chunk_index += 1 start = end - self.chunk_overlap return chunks def estimate_tokens(self, text: str) -> int: """Handle estimate tokens for the Text Chunker instance.""" chinese_chars = len(re.findall(r"[^\x00-\xff]", text)) english_chars = len(text) - chinese_chars return int(chinese_chars / 1.5 + english_chars / 4) chunker = TextChunker()