78 lines
2.3 KiB
Python
78 lines
2.3 KiB
Python
|
|
import re
|
||
|
|
from typing import List
|
||
|
|
from app.core.config import settings
|
||
|
|
|
||
|
|
|
||
|
|
class TextChunker:
    """Split text into chunks, either at clause headings or by fixed size."""

    # Clause heading such as "第十二条", "第一千零一条" or "第12条". The single
    # capture group makes ``split`` keep every heading in its result list.
    _CLAUSE_PATTERN = re.compile(r"(第[一二三四五六七八九十百千零〇\d]+条)")

    # Any character above U+00FF; treated as a "wide" (e.g. CJK) character
    # by ``estimate_tokens``.
    _WIDE_CHAR_PATTERN = re.compile(r"[^\x00-\xff]")

    def __init__(
        self,
        chunk_size: int = settings.chunk_size,
        chunk_overlap: int = settings.chunk_overlap,
    ):
        """Store the windowing parameters used by ``chunk_by_size``.

        Args:
            chunk_size: Maximum number of characters per fixed-size chunk
                (default: ``settings.chunk_size``).
            chunk_overlap: Number of characters shared by two consecutive
                fixed-size chunks (default: ``settings.chunk_overlap``).
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split ``text`` at clause headings (intended for regulations).

        A heading is "第" + a Chinese or decimal numeral + "条". Every
        occurrence starts a new chunk, including occurrences inside a
        sentence (the pattern is not anchored to the start of a line).
        Text before the first heading is discarded, and a heading followed
        only by whitespace produces no chunk.

        Args:
            text: The document text to split.

        Returns:
            A list of dicts with the keys ``clause_id`` (the heading text),
            ``content`` (the stripped text following the heading) and
            ``chunk_index`` (0-based position in the returned list).
        """
        # Splitting on a pattern with one capture group always yields
        # [preamble, heading, body, heading, body, ...].
        parts = self._CLAUSE_PATTERN.split(text)

        chunks = []
        for heading, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": heading,
                "content": content,
                # Only emitted chunks are numbered, so the index is simply
                # the number of chunks collected so far.
                "chunk_index": len(chunks),
            })

        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split ``text`` into fixed-size character windows with overlap.

        Each window holds at most ``self.chunk_size`` characters and begins
        ``self.chunk_overlap`` characters before the end of the previous
        window. Windows that contain only whitespace are skipped.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with the keys ``content`` (the stripped window
            text), ``chunk_index`` (0-based position in the returned list),
            ``start_pos`` and ``end_pos`` (bounds of the unstripped window
            in ``text``, with ``end_pos`` exclusive and never greater than
            ``len(text)``).

        Raises:
            ValueError: If ``chunk_size`` is not positive or
                ``chunk_overlap`` is not smaller than ``chunk_size``; the
                window could never advance with such settings.
        """
        size = self.chunk_size
        overlap = self.chunk_overlap
        if size <= 0 or overlap >= size:
            raise ValueError(
                "chunk_size must be positive and chunk_overlap must be "
                f"smaller than chunk_size (got chunk_size={size}, "
                f"chunk_overlap={overlap})"
            )

        chunks = []
        text_len = len(text)
        start = 0

        while start < text_len:
            # Clamp so that end_pos is always a valid slice bound.
            end = min(start + size, text_len)
            content = text[start:end].strip()

            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })

            # Once a window reaches the end of the text, stepping back by
            # the overlap would only re-emit a tail that is already covered.
            if end == text_len:
                break
            start = end - overlap

        return chunks

    def estimate_tokens(self, text: str) -> int:
        """Return a rough estimate of the number of tokens in ``text``.

        Characters above U+00FF are counted at 1.5 characters per token and
        all remaining characters at 4 characters per token. The sum is
        truncated to an integer.
        """
        wide_chars = len(self._WIDE_CHAR_PATTERN.findall(text))
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
|
||
|
|
|
||
|
|
|
||
|
|
# Module-level TextChunker instance built with the constructor's default
# chunk_size and chunk_overlap (taken from settings).
chunker = TextChunker()
|