初始化

2026-05-11 11:22:55 +08:00
parent 5f6c571434
commit 80dcd070f7
39 changed files with 1997 additions and 0 deletions
--- a/app/utils/chunking.py
+++ b/app/utils/chunking.py
@@ -0,0 +1,78 @@
+import re
+from typing import List
+from app.core.config import settings
+
+
+class TextChunker:
+    """Split raw document text into chunks.
+
+    Two strategies are offered: ``chunk_by_clause`` cuts at clause headings
+    of the form "第…条", and ``chunk_by_size`` cuts fixed-size windows that
+    overlap by ``chunk_overlap`` characters. ``estimate_tokens`` gives a
+    rough token count for a piece of text.
+    """
+
+    # Clause heading: "第" + numeral + "条". Besides 一..十 and 百 the numeral
+    # may contain 千, 零 and 〇 (e.g. "第一百零一条", "第一千二百六十条") or
+    # decimal digits (e.g. "第12条"). The capturing group makes ``split``
+    # keep the headings in its result.
+    _CLAUSE_PATTERN = re.compile(r"(第[一二三四五六七八九十百千零〇\d]+条)")
+
+    def __init__(
+        self,
+        chunk_size: int = settings.chunk_size,
+        chunk_overlap: int = settings.chunk_overlap,
+    ):
+        """Store the window parameters used by ``chunk_by_size``.
+
+        Args:
+            chunk_size: Window length in characters.
+            chunk_overlap: Number of characters shared by two consecutive
+                windows.
+
+        The defaults are read from ``settings`` once, when the class is
+        defined.
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+    def chunk_by_clause(self, text: str) -> List[dict]:
+        """Split ``text`` at clause headings (intended for regulations).
+
+        Args:
+            text: Document text containing headings such as "第十二条".
+
+        Returns:
+            One dict per clause with a non-blank body, in document order,
+            with keys ``clause_id`` (the heading), ``content`` (the stripped
+            body) and ``chunk_index`` (0-based position in the result).
+            Text before the first heading and clauses whose body is blank
+            are dropped.
+        """
+        # NOTE: the pattern is not anchored to line starts, so a reference
+        # written like a heading inside a clause body also starts a new chunk.
+        parts = self._CLAUSE_PATTERN.split(text)
+
+        chunks = []
+        # split() yields [preamble, heading, body, heading, body, ...], so the
+        # odd slots are headings and the slot after each one is its body.
+        for clause_id, body in zip(parts[1::2], parts[2::2]):
+            content = body.strip()
+            if not content:
+                continue
+            chunks.append({
+                "clause_id": clause_id,
+                "content": content,
+                "chunk_index": len(chunks),
+            })
+
+        return chunks
+
+    def chunk_by_size(self, text: str) -> List[dict]:
+        """Split ``text`` into fixed-size, overlapping windows.
+
+        Args:
+            text: Text to split.
+
+        Returns:
+            One dict per window whose stripped text is non-empty, with keys
+            ``content`` (the stripped window), ``chunk_index`` (0-based
+            position in the result), ``start_pos`` and ``end_pos`` (bounds of
+            the unstripped window in ``text``; ``end_pos`` is exclusive and
+            never exceeds ``len(text)``).
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
+                is not smaller than ``chunk_size``; the window could then
+                never advance.
+        """
+        step = self.chunk_size - self.chunk_overlap
+        if self.chunk_size <= 0 or step <= 0:
+            raise ValueError(
+                "chunk_size must be positive and greater than chunk_overlap "
+                f"(got chunk_size={self.chunk_size}, "
+                f"chunk_overlap={self.chunk_overlap})"
+            )
+
+        chunks = []
+        text_len = len(text)
+        start = 0
+
+        while start < text_len:
+            end = min(start + self.chunk_size, text_len)
+            content = text[start:end].strip()
+
+            if content:
+                chunks.append({
+                    "content": content,
+                    "chunk_index": len(chunks),
+                    "start_pos": start,
+                    "end_pos": end,
+                })
+
+            # Stop once a window reaches the end of the text; stepping again
+            # would only re-emit the overlap tail of this window.
+            if end >= text_len:
+                break
+            start += step
+
+        return chunks
+
+    def estimate_tokens(self, text: str) -> int:
+        """Return a rough token estimate for ``text``.
+
+        Characters outside the ``\\x00``-``\\xff`` range (treated as Chinese)
+        are counted at 1.5 characters per token and all other characters at
+        4 characters per token; the sum is truncated to an int.
+        """
+        chinese_chars = len(re.findall(r"[^\x00-\xff]", text))
+        english_chars = len(text) - chinese_chars
+        return int(chinese_chars / 1.5 + english_chars / 4)
+
+
+# Module-level TextChunker instance built with the default chunk_size and
+# chunk_overlap taken from settings.
+chunker = TextChunker()