Files
AIRegulation-Demo-Test-Backend/app/utils/chunking.py

78 lines
2.3 KiB
Python
Raw Normal View History

2026-05-11 11:22:55 +08:00
import re
from typing import List, Optional

from app.core.config import settings
class TextChunker:
    """Split text into chunks, either at clause headings or by fixed size.

    Attributes are plain ints:
        chunk_size: window length, in characters, used by ``chunk_by_size``.
        chunk_overlap: number of characters shared by consecutive windows.
    """

    # Clause headings such as "第五条", "第一百零一条", "第一千零二条" or "第12条".
    # The capturing group makes ``split`` keep the headings in its result.
    _CLAUSE_PATTERN = re.compile(r"(第[一二三四五六七八九十百千零〇\d]+条)")

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Create a chunker.

        Args:
            chunk_size: Window length in characters. ``None`` (the default)
                means "use ``settings.chunk_size``".
            chunk_overlap: Overlap between consecutive windows in characters.
                ``None`` (the default) means "use ``settings.chunk_overlap``".
        """
        # Resolve the configured defaults at instantiation time rather than
        # freezing them into the signature when the class is defined.
        self.chunk_size = settings.chunk_size if chunk_size is None else chunk_size
        self.chunk_overlap = (
            settings.chunk_overlap if chunk_overlap is None else chunk_overlap
        )

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split *text* at clause headings (intended for regulation documents).

        Every occurrence of a heading starts a new chunk, including in-text
        cross references that happen to match the heading pattern.  Text that
        precedes the first heading is discarded, and a heading whose body is
        empty or whitespace-only produces no chunk.

        Args:
            text: The document text.

        Returns:
            A list of dicts with keys ``clause_id`` (the heading text),
            ``content`` (the stripped body) and ``chunk_index`` (0-based
            position among the emitted chunks).
        """
        # With one capturing group, split() yields
        # [preamble, heading, body, heading, body, ...].
        parts = self._CLAUSE_PATTERN.split(text)
        chunks: List[dict] = []
        for clause_id, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": clause_id,
                "content": content,
                "chunk_index": len(chunks),
            })
        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split *text* into fixed-size windows with overlap.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with keys ``content`` (the stripped window text),
            ``chunk_index`` (0-based position among the emitted chunks),
            ``start_pos`` and ``end_pos`` (the window's ``[start, end)`` slice
            bounds in *text*, before stripping).  Whitespace-only windows are
            skipped.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``; the window could never
                advance with such a configuration.
        """
        if self.chunk_size <= 0 or self.chunk_overlap >= self.chunk_size:
            raise ValueError(
                f"chunk_size ({self.chunk_size}) must be positive and greater "
                f"than chunk_overlap ({self.chunk_overlap})"
            )

        step = self.chunk_size - self.chunk_overlap
        text_len = len(text)
        chunks: List[dict] = []
        start = 0
        while start < text_len:
            # Clamp so the reported end position never exceeds the text.
            end = min(start + self.chunk_size, text_len)
            content = text[start:end].strip()
            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })
            if end >= text_len:
                # The window already covers the tail; another step would only
                # re-emit text contained in this chunk.
                break
            start += step
        return chunks

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token-count estimate for *text*.

        Characters above U+00FF are counted at 1.5 characters per token and
        all remaining characters at 4 characters per token; the sum is
        truncated to an int.
        """
        wide_chars = len(re.findall(r"[^\x00-\xff]", text))
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
# Shared module-level instance built with the default constructor arguments.
chunker = TextChunker()