Files
AIRegulation-DocAnalysis/backend/app/utils/chunking.py

84 lines
2.6 KiB
Python
Raw Permalink Normal View History

"""Provide utility helpers for chunking."""
2026-05-14 15:07:34 +08:00
import re
from typing import List
from app.core.config import settings
# Chunking defaults (chunk size and overlap) are read from the application settings at import time.
2026-05-14 15:07:34 +08:00
class TextChunker:
    """Split document text into clause-based or fixed-size chunks.

    Two strategies are offered:

    * ``chunk_by_clause`` splits on Chinese article headings ("第…条").
    * ``chunk_by_size`` slides a fixed-size character window over the text.

    ``estimate_tokens`` gives a rough token count for a piece of text.
    """

    # Chinese article headings such as "第一条" or "第一百零一条".
    # 零/〇 and 千 are part of the numeral set so that articles like 101
    # ("第一百零一条") or 1000 ("第一千条") are recognised instead of being
    # silently merged into the preceding clause.  The capture group makes
    # ``re.split`` keep each heading in its result list.
    _CLAUSE_PATTERN = re.compile(r"(第[零〇一二三四五六七八九十百千]+条)")

    def __init__(
        self,
        chunk_size: int = settings.chunk_size,
        chunk_overlap: int = settings.chunk_overlap,
    ):
        """Store the window size and overlap used by ``chunk_by_size``.

        Args:
            chunk_size: Maximum number of characters per size-based chunk.
            chunk_overlap: Number of characters shared by consecutive
                size-based chunks.  Must be smaller than ``chunk_size`` for
                ``chunk_by_size`` to work.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split ``text`` into one chunk per article heading ("第…条").

        Text that precedes the first heading is discarded, and a heading
        whose body is empty or whitespace-only produces no chunk.  A text
        without any heading therefore yields an empty list.

        Note that every occurrence of a heading-like token starts a new
        chunk, including in-sentence references to another article.

        Args:
            text: The document text to split.

        Returns:
            A list of dicts with keys ``clause_id`` (the heading text),
            ``content`` (the stripped body) and ``chunk_index`` (0-based
            position in the returned list).
        """
        # With one capture group, re.split returns
        # [preamble, heading_1, body_1, heading_2, body_2, ...].
        parts = self._CLAUSE_PATTERN.split(text)

        chunks: List[dict] = []
        for heading, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": heading,
                "content": content,
                "chunk_index": len(chunks),
            })
        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split ``text`` into overlapping windows of ``chunk_size`` characters.

        Consecutive windows start ``chunk_size - chunk_overlap`` characters
        apart.  Windows that contain only whitespace are skipped.  Chunking
        stops as soon as a window reaches the end of the text, so no extra
        chunk consisting solely of already-covered overlap is produced.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with keys ``content`` (the stripped window),
            ``chunk_index`` (0-based position in the returned list),
            ``start_pos`` and ``end_pos`` (the window's slice bounds in
            ``text``; ``end_pos`` never exceeds ``len(text)``).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size`` (the window would never
                advance).
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        text_len = len(text)
        step = self.chunk_size - self.chunk_overlap

        chunks: List[dict] = []
        start = 0
        while start < text_len:
            # Clamp so the reported end position is a valid slice bound.
            end = min(start + self.chunk_size, text_len)
            content = text[start:end].strip()
            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })
            if end >= text_len:
                # The whole text is covered; a further window would only
                # repeat the overlap tail of this one.
                break
            start += step
        return chunks

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token estimate for ``text``.

        Characters with a code point above 0xFF are counted as wide (e.g.
        Chinese) characters at 1.5 characters per token; all remaining
        characters are counted at 4 characters per token.  The sum is
        truncated to an integer.
        """
        wide_chars = len(re.findall(r"[^\x00-\xff]", text))
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
# Shared module-level TextChunker built with the default chunk size and overlap from settings.
chunker = TextChunker()