初始化
This commit is contained in:
4
app/utils/__init__.py
Normal file
4
app/utils/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .chunking import TextChunker, chunker
|
||||
from .logger import logger, setup_logging
|
||||
|
||||
# Names re-exported as the public API of the utils package.
__all__ = ["TextChunker", "chunker", "logger", "setup_logging"]
|
||||
78
app/utils/chunking.py
Normal file
78
app/utils/chunking.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import re
from typing import List, Optional

from app.core.config import settings
|
||||
|
||||
|
||||
class TextChunker:
    """Split text into chunk dictionaries, by clause heading or by fixed size."""

    # Clause headings: "第" + numerals + "条". The numeral class includes
    # 千/零/〇 and ASCII digits so that headings such as "第一千零一条" and
    # "第12条" are recognised as well as the shorter "第十二条" forms.
    # The single capturing group makes ``split`` keep the headings.
    _CLAUSE_PATTERN = re.compile(r"(第[一二三四五六七八九十百千零〇0-9]+条)")

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Create a chunker.

        Args:
            chunk_size: Window length in characters used by ``chunk_by_size``.
                ``None`` means "use ``settings.chunk_size``".
            chunk_overlap: Number of characters shared by consecutive
                windows. ``None`` means "use ``settings.chunk_overlap``".
        """
        # Resolve the defaults when the instance is created rather than when
        # the class is defined, so the values are read from ``settings`` at
        # construction time instead of being frozen at import time.
        self.chunk_size = settings.chunk_size if chunk_size is None else chunk_size
        self.chunk_overlap = (
            settings.chunk_overlap if chunk_overlap is None else chunk_overlap
        )

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split ``text`` at clause headings (intended for regulation texts).

        Every heading matched by ``_CLAUSE_PATTERN`` starts a new chunk whose
        content is the text up to the next heading.

        Args:
            text: The document text to split.

        Returns:
            A list of dicts with keys ``clause_id`` (the heading text),
            ``content`` (the stripped clause body) and ``chunk_index``
            (0-based position in the returned list). Text before the first
            heading, and headings whose body is empty or whitespace-only,
            produce no chunk.

        Note:
            Headings are matched anywhere in the text, so an inline reference
            to another clause also starts a new chunk.
        """
        # With one capturing group, ``split`` returns
        # [preamble, heading, body, heading, body, ...]: odd indices are
        # headings and the following even index is that heading's body.
        parts = self._CLAUSE_PATTERN.split(text)

        chunks: List[dict] = []
        for heading, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": heading,
                "content": content,
                "chunk_index": len(chunks),
            })
        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split ``text`` into fixed-size windows with overlap.

        Consecutive windows start ``chunk_size - chunk_overlap`` characters
        apart. The last window is the first one that reaches the end of the
        text; no extra chunk is emitted for the overlap tail alone.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with keys ``content`` (the stripped window text),
            ``chunk_index`` (0-based position in the returned list),
            ``start_pos`` and ``end_pos`` (bounds of the unstripped window in
            ``text``; ``end_pos`` never exceeds ``len(text)``). Windows that
            contain only whitespace are skipped.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``; either would keep the
                window from ever advancing.
        """
        if not text:
            return []

        size = self.chunk_size
        overlap = self.chunk_overlap
        if size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap >= size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        text_len = len(text)
        step = size - overlap
        chunks: List[dict] = []
        start = 0

        while start < text_len:
            # Clamp so the reported end position stays inside the text.
            end = min(start + size, text_len)
            content = text[start:end].strip()
            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })
            # Once a window reaches the end, any further window would only
            # repeat the overlap tail of this one.
            if end >= text_len:
                break
            start += step

        return chunks

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token estimate for ``text``.

        Characters with a code point above 0xFF are counted at 1.5 characters
        per token and all remaining characters at 4 characters per token; the
        sum is truncated to an integer.
        """
        wide_chars = sum(1 for ch in text if ord(ch) > 0xFF)
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
|
||||
|
||||
|
||||
# Shared module-level chunker built with the constructor's default arguments.
chunker = TextChunker()
|
||||
24
app/utils/logger.py
Normal file
24
app/utils/logger.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import logging
|
||||
import sys
|
||||
|
||||
|
||||
def setup_logging() -> logging.Logger:
    """Configure and return the ``"app"`` logger.

    The logger level is set to INFO and a stdout ``StreamHandler`` (level
    INFO, ``timestamp | level | name | message`` format) is attached.

    The function is safe to call more than once: if the logger already has a
    handler writing to the current ``sys.stdout``, no second handler is
    added, so repeated calls do not duplicate log lines.

    Returns:
        The configured ``"app"`` logger.
    """
    logger = logging.getLogger("app")
    logger.setLevel(logging.INFO)

    # ``logging.getLogger`` returns the same object for the same name, so a
    # handler added by an earlier call is still attached here.
    already_attached = any(
        isinstance(existing, logging.StreamHandler)
        and getattr(existing, "stream", None) is sys.stdout
        for existing in logger.handlers
    )
    if already_attached:
        return logger

    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(handler)

    return logger
|
||||
|
||||
|
||||
# Module-level "app" logger, configured when this module is imported.
logger = setup_logging()
|
||||
Reference in New Issue
Block a user