初始化
This commit is contained in:
78
app/utils/chunking.py
Normal file
78
app/utils/chunking.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import re
from typing import List, Optional

from app.core.config import settings
|
||||
|
||||
|
||||
class TextChunker:
    """Split text into chunks, either at clause headings or by fixed size."""

    # Matches clause headings such as "第一条", "第一百零一条" or "第12条".
    # The capture group makes ``re.split`` keep every heading in its result.
    _CLAUSE_PATTERN = re.compile(r"(第[一二三四五六七八九十百千零〇0-9]+条)")

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Store the chunking parameters.

        Args:
            chunk_size: Maximum number of characters per fixed-size chunk.
                Falls back to ``settings.chunk_size`` when omitted.
            chunk_overlap: Number of characters shared by consecutive
                fixed-size chunks. Falls back to ``settings.chunk_overlap``
                when omitted.
        """
        # Resolve the defaults here rather than in the signature so they are
        # read from ``settings`` at construction time, not frozen at import.
        self.chunk_size = settings.chunk_size if chunk_size is None else chunk_size
        self.chunk_overlap = (
            settings.chunk_overlap if chunk_overlap is None else chunk_overlap
        )

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split *text* at clause headings (intended for regulation documents).

        Text that precedes the first heading is not returned, and a heading
        whose body is empty or whitespace-only produces no chunk.

        Args:
            text: The document text to split.

        Returns:
            A list of dicts with keys ``clause_id`` (the heading text),
            ``content`` (the stripped clause body) and ``chunk_index``
            (0-based position among the returned chunks).
        """
        # With one capture group, re.split returns
        # [preamble, heading1, body1, heading2, body2, ...].
        parts = self._CLAUSE_PATTERN.split(text)

        chunks: List[dict] = []
        for clause_id, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": clause_id,
                "content": content,
                "chunk_index": len(chunks),
            })
        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split *text* into overlapping windows of ``chunk_size`` characters.

        Whitespace-only windows are skipped and do not consume a
        ``chunk_index``. ``start_pos``/``end_pos`` describe the window in
        *text* before the content is stripped.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with keys ``content``, ``chunk_index``,
            ``start_pos`` and ``end_pos``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``; the window could never
                advance and the loop would not terminate.
        """
        size = self.chunk_size
        step = size - self.chunk_overlap
        if size <= 0 or step <= 0:
            raise ValueError(
                "chunk_size must be positive and greater than chunk_overlap "
                f"(chunk_size={size}, chunk_overlap={self.chunk_overlap})"
            )

        text_len = len(text)
        chunks: List[dict] = []
        start = 0
        while start < text_len:
            # Clamp so end_pos never points past the end of the text.
            end = min(start + size, text_len)
            content = text[start:end].strip()
            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })
            # Once a window reaches the end of the text, any further window
            # would only repeat the overlap already covered by this one.
            if end >= text_len:
                break
            start += step
        return chunks

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token-count estimate for *text*.

        Characters above U+00FF are counted at 1.5 characters per token and
        all remaining characters at 4 characters per token; the sum is
        truncated to an int.
        """
        wide_chars = sum(1 for ch in text if ord(ch) > 0xFF)
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
|
||||
|
||||
|
||||
# Module-level TextChunker instance created with the constructor's default arguments.
chunker = TextChunker()
|
||||
Reference in New Issue
Block a user