Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,19 +1,25 @@
|
||||
"""Provide utility helpers for chunking."""
|
||||
|
||||
import re
|
||||
from typing import List
|
||||
from app.core.config import settings
|
||||
# Keep module behavior explicit so the backend flow stays easy to audit.
|
||||
|
||||
|
||||
|
||||
class TextChunker:
|
||||
"""Represent the Text Chunker type."""
|
||||
def __init__(
    self,
    chunk_size: int = settings.chunk_size,
    chunk_overlap: int = settings.chunk_overlap,
):
    """Store the chunking parameters on the new instance.

    Args:
        chunk_size: Value kept as ``self.chunk_size``. The default is
            read from ``settings.chunk_size`` once, when this method
            definition is evaluated.
        chunk_overlap: Value kept as ``self.chunk_overlap``. The default
            is read from ``settings.chunk_overlap`` in the same way.
    """
    self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
|
||||
|
||||
def chunk_by_clause(self, text: str) -> List[dict]:
|
||||
"""按条款边界分块(适用于法规文档)"""
|
||||
"""Handle chunk by clause for the Text Chunker instance."""
|
||||
clause_pattern = r"(第[一二三四五六七八九十百]+条)"
|
||||
parts = re.split(clause_pattern, text)
|
||||
|
||||
@@ -46,7 +52,7 @@ class TextChunker:
|
||||
return chunks
|
||||
|
||||
def chunk_by_size(self, text: str) -> List[dict]:
|
||||
"""按固定大小分块"""
|
||||
"""Handle chunk by size for the Text Chunker instance."""
|
||||
chunks = []
|
||||
start = 0
|
||||
chunk_index = 0
|
||||
@@ -69,7 +75,7 @@ class TextChunker:
|
||||
return chunks
|
||||
|
||||
def estimate_tokens(self, text: str) -> int:
    """Return a rough token estimate for ``text``.

    Every character whose code point is above U+00FF is weighted at
    1.5 characters per token; every other character (including
    whitespace and newlines) is weighted at 4 characters per token.
    The combined value is truncated to an integer.

    Args:
        text: The string to measure.

    Returns:
        The truncated estimate; ``0`` for an empty string.
    """
    # Same set of characters as the regex class [^\x00-\xff].
    wide_count = sum(1 for ch in text if ord(ch) > 0xFF)
    narrow_count = len(text) - wide_count
    return int(wide_count / 1.5 + narrow_count / 4)
|
||||
|
||||
Reference in New Issue
Block a user