84 lines
2.6 KiB
Python
84 lines
2.6 KiB
Python
"""Provide utility helpers for chunking."""
|
|
|
|
import re
from typing import List, Optional

from app.core.config import settings
|
|
# Keep module behavior explicit so the backend flow stays easy to audit.
|
|
|
|
|
|
|
|
class TextChunker:
    """Split text into chunks, either per clause heading or by fixed size.

    Attributes:
        chunk_size: Window length in characters used by ``chunk_by_size``.
        chunk_overlap: Number of characters shared by consecutive windows
            in ``chunk_by_size``.
    """

    # Headings such as "第十二条". The capture group makes ``split`` keep each
    # heading in its result list, alternating with the text between headings.
    _CLAUSE_PATTERN = re.compile(r"(第[一二三四五六七八九十百]+条)")

    # Any character outside the Latin-1 range (code point above 0xFF).
    _WIDE_CHAR_PATTERN = re.compile(r"[^\x00-\xff]")

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Store the chunking parameters.

        Args:
            chunk_size: Window length in characters. ``None`` selects
                ``settings.chunk_size``.
            chunk_overlap: Characters shared by consecutive windows. ``None``
                selects ``settings.chunk_overlap``.
        """
        # Read the configured defaults when the instance is built rather than
        # when the class is defined, so the values are never frozen at import.
        self.chunk_size = (
            settings.chunk_size if chunk_size is None else chunk_size
        )
        self.chunk_overlap = (
            settings.chunk_overlap if chunk_overlap is None else chunk_overlap
        )

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split ``text`` at clause headings of the form ``第…条``.

        Text before the first heading is discarded, and a heading followed
        only by whitespace produces no chunk.

        Args:
            text: The text to split.

        Returns:
            One dict per non-empty clause, in document order, with the keys
            ``clause_id`` (the heading), ``content`` (the stripped clause
            text) and ``chunk_index`` (0-based position in the result).
        """
        parts = self._CLAUSE_PATTERN.split(text)

        chunks = []
        # parts[0] is the preamble; after it, headings sit at odd indexes and
        # the text belonging to each heading at the following even index.
        for clause_id, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": clause_id,
                "content": content,
                "chunk_index": len(chunks),
            })

        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split ``text`` into overlapping fixed-size character windows.

        Windows that contain only whitespace are skipped and do not consume
        a ``chunk_index``.

        Args:
            text: The text to split.

        Returns:
            One dict per window with the keys ``content`` (the stripped
            window text), ``chunk_index`` (0-based position in the result),
            ``start_pos`` and ``end_pos`` (offsets of the unstripped window
            in ``text``; ``end_pos`` never exceeds ``len(text)``).

        Raises:
            ValueError: If ``chunk_size`` is not positive, or
                ``chunk_overlap`` is negative or not smaller than
                ``chunk_size``.
        """
        step = self._stride()
        text_length = len(text)

        chunks = []
        start = 0
        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            content = text[start:end].strip()
            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })

            # Once a window reaches the end of the text, any further window
            # would only repeat the tail of this one.
            if end >= text_length:
                break
            start += step

        return chunks

    def _stride(self) -> int:
        """Return how far the window advances per chunk, validating the sizes."""
        if self.chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be positive, got {self.chunk_size}"
            )
        if not 0 <= self.chunk_overlap < self.chunk_size:
            # An overlap >= chunk_size would never advance the window.
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={self.chunk_overlap}, "
                f"chunk_size={self.chunk_size}"
            )
        return self.chunk_size - self.chunk_overlap

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token count for ``text``.

        Characters outside the Latin-1 range are weighted at 1.5 characters
        per token and all remaining characters at 4 characters per token.
        The sum is truncated to an integer, so short inputs can yield 0.

        Args:
            text: The text to measure.

        Returns:
            The estimated number of tokens.
        """
        wide_chars = len(self._WIDE_CHAR_PATTERN.findall(text))
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
|
|
|
|
|
|
# Module-level TextChunker instance built with the constructor's default
# chunk size and overlap.
chunker = TextChunker()