初始化

This commit is contained in:
2026-05-11 11:22:55 +08:00
parent 5f6c571434
commit 80dcd070f7
39 changed files with 1997 additions and 0 deletions

4
app/utils/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .chunking import TextChunker, chunker
from .logger import logger, setup_logging
# Names re-exported as the public API of this package.
__all__ = ["TextChunker", "chunker", "logger", "setup_logging"]

78
app/utils/chunking.py Normal file
View File

@@ -0,0 +1,78 @@
import re
from typing import List, Optional

from app.core.config import settings
class TextChunker:
    """Split text into chunks, either at clause headings or by fixed size.

    Attributes:
        chunk_size: Maximum number of characters per fixed-size chunk.
        chunk_overlap: Number of characters shared by consecutive
            fixed-size chunks.
    """

    # Clause headings such as "第十二条", "第一百零一条", "第一千二百条" or
    # "第12条".  The single capturing group makes ``split`` keep the headings
    # in its result so they can be paired with the text that follows them.
    _CLAUSE_PATTERN = re.compile(r"(第[零〇一二三四五六七八九十百千\d]+条)")

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
    ):
        """Create a chunker.

        Args:
            chunk_size: Characters per fixed-size chunk.  When omitted,
                ``settings.chunk_size`` is read at construction time.
            chunk_overlap: Characters shared between consecutive fixed-size
                chunks.  When omitted, ``settings.chunk_overlap`` is read at
                construction time.
        """
        # Resolve the defaults here rather than in the signature: signature
        # defaults are evaluated once, when the class body is executed.
        self.chunk_size = (
            settings.chunk_size if chunk_size is None else chunk_size
        )
        self.chunk_overlap = (
            settings.chunk_overlap if chunk_overlap is None else chunk_overlap
        )

    def chunk_by_clause(self, text: str) -> List[dict]:
        """Split ``text`` at clause headings (intended for regulation texts).

        Text before the first heading is discarded, and a heading followed
        only by whitespace produces no chunk.  Headings are matched anywhere
        in the text, so an inline reference that looks like a heading is also
        treated as a boundary.

        Args:
            text: The document text to split.

        Returns:
            A list of dicts with keys ``clause_id`` (the heading text),
            ``content`` (the stripped clause body) and ``chunk_index``
            (0-based position in the returned list).  Empty when ``text``
            contains no heading.
        """
        # split() yields [preamble, heading, body, heading, body, ...]:
        # headings sit at odd indices, each followed by its body.
        parts = self._CLAUSE_PATTERN.split(text)
        chunks: List[dict] = []
        for heading, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            if not content:
                continue
            chunks.append({
                "clause_id": heading,
                "content": content,
                "chunk_index": len(chunks),
            })
        return chunks

    def chunk_by_size(self, text: str) -> List[dict]:
        """Split ``text`` into windows of ``chunk_size`` characters.

        Consecutive windows start ``chunk_size - chunk_overlap`` characters
        apart.  Windows that contain only whitespace are skipped.

        Args:
            text: The text to split.

        Returns:
            A list of dicts with keys ``content`` (the stripped window text),
            ``chunk_index`` (0-based position in the returned list),
            ``start_pos`` and ``end_pos`` (half-open character offsets of the
            unstripped window in ``text``; ``end_pos`` never exceeds
            ``len(text)``).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``; either would keep the
                window from ever advancing.
        """
        size = self.chunk_size
        overlap = self.chunk_overlap
        if size <= 0:
            raise ValueError(f"chunk_size must be positive, got {size}")
        if overlap >= size:
            raise ValueError(
                f"chunk_overlap ({overlap}) must be smaller than "
                f"chunk_size ({size})"
            )

        step = size - overlap
        text_len = len(text)
        chunks: List[dict] = []
        start = 0
        while start < text_len:
            end = min(start + size, text_len)
            content = text[start:end].strip()
            if content:
                chunks.append({
                    "content": content,
                    "chunk_index": len(chunks),
                    "start_pos": start,
                    "end_pos": end,
                })
            # Once a window reaches the end of the text, any further window
            # would only repeat the overlap tail of this one.
            if end >= text_len:
                break
            start += step
        return chunks

    def estimate_tokens(self, text: str) -> int:
        """Return a rough token-count estimate for ``text``.

        Characters outside the ``\\x00``-``\\xff`` range are counted at 1.5
        characters per token and all remaining characters at 4 characters
        per token; the sum is truncated to an int.

        Args:
            text: The text to measure.

        Returns:
            The estimated number of tokens (0 for an empty string).
        """
        wide_chars = len(re.findall(r"[^\x00-\xff]", text))
        narrow_chars = len(text) - wide_chars
        return int(wide_chars / 1.5 + narrow_chars / 4)
# Module-level TextChunker instance constructed with the default arguments.
chunker = TextChunker()

24
app/utils/logger.py Normal file
View File

@@ -0,0 +1,24 @@
import logging
import sys
def setup_logging() -> logging.Logger:
    """Configure and return the ``"app"`` logger.

    Sets the logger level to INFO and attaches a stdout stream handler
    that formats records as ``time | level | name | message``.  The
    function may be called repeatedly: the handler is only attached when the
    logger does not already have a stream handler writing to the current
    ``sys.stdout``, so repeated calls do not duplicate log lines.

    Returns:
        The configured ``"app"`` logger.
    """
    app_logger = logging.getLogger("app")
    app_logger.setLevel(logging.INFO)

    # logging.getLogger returns the same object for the same name, so an
    # unconditional addHandler would stack one more handler per call.
    has_stdout_handler = any(
        isinstance(existing, logging.StreamHandler)
        and getattr(existing, "stream", None) is sys.stdout
        for existing in app_logger.handlers
    )
    if not has_stdout_handler:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.INFO)
        handler.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        app_logger.addHandler(handler)

    return app_logger
# Module-level logger obtained by calling setup_logging() when this module is imported.
logger = setup_logging()