Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,6 +1,18 @@
|
||||
"""嵌入和分块服务"""
|
||||
"""Initialize the app.services.embedding package."""
|
||||
# Keep package boundaries explicit so backend imports stay predictable.
|
||||
|
||||
from .text_chunker import RegulationChunker
|
||||
from .bge_m3_embedder import BGEM3Embedder
|
||||
|
||||
__all__ = ["RegulationChunker", "BGEM3Embedder"]
|
||||
|
||||
|
||||
def __getattr__(name: str):
    """Resolve the package's exported classes lazily (PEP 562 module ``__getattr__``).

    The submodule that defines the requested class is imported only when the
    name is actually looked up on the package.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        ``RegulationChunker`` or ``BGEM3Embedder``, depending on ``name``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported classes.
    """
    if name == "RegulationChunker":
        from .text_chunker import RegulationChunker

        return RegulationChunker
    if name == "BGEM3Embedder":
        from .bge_m3_embedder import BGEM3Embedder

        return BGEM3Embedder
    # Mirror the message format of the interpreter's own module attribute errors
    # so that failed lookups are easy to diagnose.
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""BGE-M3嵌入服务 - Dense+Sparse双路向量生成"""
|
||||
"""Provide service-layer logic for bge m3 embedder."""
|
||||
|
||||
import numpy as np
|
||||
from typing import List, Dict, Optional, Union
|
||||
@@ -6,43 +6,31 @@ from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
import torch
|
||||
import os
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
|
||||
# Point HuggingFace downloads at the hf-mirror.com mirror (intended for
# networks inside China) unless the caller has already set HF_ENDPOINT.
os.environ.setdefault('HF_ENDPOINT', 'https://hf-mirror.com')

# Local model directories, checked in priority order.
LOCAL_MODEL_PATHS = [
    os.path.expanduser("~/.cache/modelscope/Xorbits/bge-m3"),  # ModelScope download path
    os.path.expanduser("~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main"),  # HuggingFace local path
]
|
||||
|
||||
|
||||
@dataclass
class EmbeddingResult:
    """Result of one embedding call: dense and sparse vectors plus their inputs."""

    # Dense vectors (used for semantic retrieval).
    dense_embeddings: np.ndarray
    # Sparse vectors (used for keyword matching), one dict per input text.
    sparse_embeddings: List[Dict[int, float]]
    # The input texts the vectors were generated for.
    texts: List[str]
    # Dimension of the dense vectors.
    dim: int = 1024
|
||||
|
||||
|
||||
class BGEM3Embedder:
|
||||
"""
|
||||
BGE-M3多语言嵌入模型服务
|
||||
|
||||
BGE-M3是BAAI发布的多语言嵌入模型,支持:
|
||||
- Dense向量:用于语义相似度检索
|
||||
- Sparse向量:用于关键词精确匹配(BM25风格)
|
||||
- ColBERT向量:用于细粒度交互匹配(可选)
|
||||
|
||||
特点:
|
||||
- 支持100+语言(中英双语优化)
|
||||
- 8192 tokens超长上下文
|
||||
- Dense+Sparse双路检索能力
|
||||
|
||||
GitHub: https://github.com/FlagOpen/FlagEmbedding
|
||||
"""
|
||||
"""Represent the B G E M3 Embedder type."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -53,28 +41,18 @@ class BGEM3Embedder:
|
||||
max_length: int = 8192,
|
||||
local_model_path: Optional[str] = None
|
||||
):
|
||||
"""
|
||||
初始化BGE-M3嵌入模型
|
||||
|
||||
Args:
|
||||
model_name: 模型名称(如果使用本地路径,此参数会被忽略)
|
||||
use_fp16: 是否使用FP16加速
|
||||
device: 设备类型(cuda/cpu),默认自动选择
|
||||
batch_size: 批处理大小
|
||||
max_length: 最大序列长度
|
||||
local_model_path: 本地模型路径(可选,优先使用)
|
||||
"""
|
||||
"""Initialize the B G E M3 Embedder instance."""
|
||||
self.use_fp16 = use_fp16
|
||||
self.batch_size = batch_size
|
||||
self.max_length = max_length
|
||||
|
||||
# 确定模型路径(优先使用本地路径)
|
||||
# Resolve the model path, preferring a local path when one is available.
|
||||
if local_model_path and os.path.exists(local_model_path):
|
||||
self.model_path = local_model_path
|
||||
self.model_name = "local"
|
||||
logger.info(f"使用本地模型路径: {local_model_path}")
|
||||
else:
|
||||
# 检查多个可能的本地路径
|
||||
# Check each known local model directory in turn.
|
||||
found_local = False
|
||||
for path in LOCAL_MODEL_PATHS:
|
||||
if os.path.exists(path) and os.path.exists(os.path.join(path, "config.json")):
|
||||
@@ -89,7 +67,7 @@ class BGEM3Embedder:
|
||||
self.model_name = model_name
|
||||
logger.info(f"使用远程模型: {model_name}")
|
||||
|
||||
# 自动选择设备
|
||||
# Select the device automatically when none was specified.
|
||||
if device is None:
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
else:
|
||||
@@ -101,7 +79,7 @@ class BGEM3Embedder:
|
||||
self._load_model()
|
||||
|
||||
def _load_model(self):
|
||||
"""加载嵌入模型"""
|
||||
"""Handle load model for this module for the B G E M3 Embedder instance."""
|
||||
try:
|
||||
from FlagEmbedding import BGEM3FlagModel
|
||||
|
||||
@@ -127,18 +105,7 @@ class BGEM3Embedder:
|
||||
return_sparse: bool = True,
|
||||
return_colbert_vecs: bool = False
|
||||
) -> EmbeddingResult:
|
||||
"""
|
||||
对文本列表生成嵌入向量
|
||||
|
||||
Args:
|
||||
texts: 文本列表
|
||||
return_dense: 是否返回Dense向量
|
||||
return_sparse: 是否返回Sparse向量
|
||||
return_colbert_vecs: 是否返回ColBERT向量
|
||||
|
||||
Returns:
|
||||
EmbeddingResult: 嵌入结果
|
||||
"""
|
||||
"""Handle embed for the B G E M3 Embedder instance."""
|
||||
if not texts:
|
||||
logger.warning("输入文本列表为空")
|
||||
return EmbeddingResult(
|
||||
@@ -151,7 +118,7 @@ class BGEM3Embedder:
|
||||
logger.info(f"开始嵌入{len(texts)}个文本块")
|
||||
|
||||
try:
|
||||
# 执行嵌入
|
||||
# Run the embedding model on the input texts.
|
||||
embeddings = self.model.encode(
|
||||
texts,
|
||||
batch_size=self.batch_size,
|
||||
@@ -161,11 +128,11 @@ class BGEM3Embedder:
|
||||
return_colbert_vecs=return_colbert_vecs
|
||||
)
|
||||
|
||||
# 提取结果
|
||||
# Extract the dense and sparse results from the model output.
|
||||
dense_embeddings = embeddings.get('dense_vecs', np.array([]))
|
||||
sparse_embeddings = embeddings.get('lexical_weights', [])
|
||||
|
||||
# 获取维度
|
||||
# Determine the embedding dimension (falls back to 1024 when there are no dense vectors).
|
||||
dim = dense_embeddings.shape[1] if len(dense_embeddings) > 0 else 1024
|
||||
|
||||
logger.success(f"嵌入完成,向量维度: {dim}")
|
||||
@@ -182,15 +149,7 @@ class BGEM3Embedder:
|
||||
raise
|
||||
|
||||
def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
|
||||
"""
|
||||
对单个文本生成嵌入向量
|
||||
|
||||
Args:
|
||||
text: 输入文本
|
||||
|
||||
Returns:
|
||||
Dict: 包含dense和sparse向量
|
||||
"""
|
||||
"""Embed single for the B G E M3 Embedder instance."""
|
||||
result = self.embed([text])
|
||||
return {
|
||||
'dense': result.dense_embeddings[0],
|
||||
@@ -199,25 +158,17 @@ class BGEM3Embedder:
|
||||
}
|
||||
|
||||
def embed_dense(self, texts: List[str]) -> np.ndarray:
    """Return only the dense vectors for ``texts``.

    Delegates to ``self.embed`` with the sparse and ColBERT outputs disabled
    and returns the ``dense_embeddings`` attribute of its result.

    Args:
        texts: Texts to embed.

    Returns:
        The dense embeddings produced by ``self.embed``.
    """
    result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
    return result.dense_embeddings
|
||||
|
||||
def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
    """Return only the sparse vectors for ``texts``.

    Delegates to ``self.embed`` with the dense and ColBERT outputs disabled
    and returns the ``sparse_embeddings`` attribute of its result.

    Args:
        texts: Texts to embed.

    Returns:
        The sparse embeddings produced by ``self.embed``, one dict per text.
    """
    result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
    return result.sparse_embeddings
|
||||
|
||||
def embed_query(self, query: str) -> Dict:
    """Embed a query string for retrieval.

    Thin alias for ``self.embed_single``; the query is passed through unchanged.

    Args:
        query: Query text.

    Returns:
        Whatever ``self.embed_single`` returns for ``query`` (a dict holding
        the dense and sparse vectors).
    """
    return self.embed_single(query)
|
||||
|
||||
def compute_similarity(
|
||||
@@ -226,26 +177,16 @@ class BGEM3Embedder:
|
||||
doc_embeddings: np.ndarray,
|
||||
metric: str = "cosine"
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
计算查询与文档的相似度
|
||||
|
||||
Args:
|
||||
query_embedding: 查询向量
|
||||
doc_embeddings: 文档向量矩阵
|
||||
metric: 相似度度量(cosine/dot)
|
||||
|
||||
Returns:
|
||||
np.ndarray: 相似度分数数组
|
||||
"""
|
||||
"""Handle compute similarity for the B G E M3 Embedder instance."""
|
||||
if metric == "cosine":
|
||||
# 余弦相似度
|
||||
# Cosine similarity: dot product divided by the product of the vector norms.
|
||||
query_norm = np.linalg.norm(query_embedding)
|
||||
doc_norms = np.linalg.norm(doc_embeddings, axis=1)
|
||||
|
||||
similarities = np.dot(doc_embeddings, query_embedding) / (doc_norms * query_norm)
|
||||
|
||||
elif metric == "dot":
|
||||
# 点积相似度
|
||||
# Dot-product similarity.
|
||||
similarities = np.dot(doc_embeddings, query_embedding)
|
||||
|
||||
else:
|
||||
@@ -258,17 +199,8 @@ class BGEM3Embedder:
|
||||
query_sparse: Dict[int, float],
|
||||
doc_sparse: Dict[int, float]
|
||||
) -> float:
|
||||
"""
|
||||
计算Sparse向量的相似度(BM25风格)
|
||||
|
||||
Args:
|
||||
query_sparse: 查询的Sparse向量(词ID -> 权重)
|
||||
doc_sparse: 文档的Sparse向量
|
||||
|
||||
Returns:
|
||||
float: 相似度分数
|
||||
"""
|
||||
# 计算交集词的点积
|
||||
"""Handle sparse similarity for the B G E M3 Embedder instance."""
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
common_keys = set(query_sparse.keys()) & set(doc_sparse.keys())
|
||||
|
||||
score = sum(query_sparse[k] * doc_sparse[k] for k in common_keys)
|
||||
@@ -280,7 +212,7 @@ def embed_texts(
|
||||
model_name: str = "BAAI/bge-m3",
|
||||
**kwargs
|
||||
) -> EmbeddingResult:
|
||||
"""便捷函数:对文本列表生成嵌入"""
|
||||
"""Embed texts."""
|
||||
embedder = BGEM3Embedder(model_name=model_name, **kwargs)
|
||||
return embedder.embed(texts)
|
||||
|
||||
@@ -290,6 +222,6 @@ def embed_single_text(
|
||||
model_name: str = "BAAI/bge-m3",
|
||||
**kwargs
|
||||
) -> Dict:
|
||||
"""便捷函数:对单个文本生成嵌入"""
|
||||
"""Embed single text."""
|
||||
embedder = BGEM3Embedder(model_name=model_name, **kwargs)
|
||||
return embedder.embed_single(text)
|
||||
|
||||
@@ -1,51 +1,46 @@
|
||||
"""智能分块器 - 章节级+条款级双粒度切割"""
|
||||
"""Provide service-layer logic for text chunker."""
|
||||
|
||||
import re
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
|
||||
|
||||
|
||||
@dataclass
class ChunkMetadata:
    """Metadata describing where a chunk sits within a regulation document."""

    doc_id: str = ""
    doc_name: str = ""
    chunk_id: str = ""
    section_number: str = ""  # Chapter/section number, e.g. "第一章"
    section_title: str = ""  # Chapter/section title
    clause_number: str = ""  # Clause number, e.g. "第一条"
    page_number: int = 0
    start_position: int = 0  # Start position within the original text
    end_position: int = 0  # End position within the original text
    regulation_type: str = ""  # Regulation type
    version: str = ""
|
||||
|
||||
|
||||
@dataclass
class TextChunk:
    """A piece of document text together with its metadata."""

    content: str
    metadata: ChunkMetadata
    token_count: int = 0  # Estimated number of tokens in ``content``
|
||||
|
||||
|
||||
class RegulationChunker:
|
||||
"""
|
||||
法规文档智能分块器
|
||||
"""Represent the Regulation Chunker type."""
|
||||
|
||||
实现章节级/条款级双粒度切割,适配国标GB文档结构:
|
||||
- 国标文档通常有明确的层级结构:章 > 节 > 条
|
||||
- 每个条款应作为一个独立的语义单元
|
||||
- 保留条款完整性,避免跨条款截断
|
||||
"""
|
||||
|
||||
# 法规标题模式
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s+[^\n]+')
|
||||
SECTION_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+节\s+[^\n]+')
|
||||
CLAUSE_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+条\s')
|
||||
|
||||
# 条款子项模式
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
SUB_ITEM_PATTERN = re.compile(r'^[\((][一二三四五六七八九十]+[\))]\s')
|
||||
NUMBER_ITEM_PATTERN = re.compile(r'^[\d]+[\.、]\s')
|
||||
|
||||
@@ -56,15 +51,7 @@ class RegulationChunker:
|
||||
max_chunk_size: int = 2048,
|
||||
min_chunk_size: int = 100
|
||||
):
|
||||
"""
|
||||
初始化分块器
|
||||
|
||||
Args:
|
||||
chunk_size: 默认分块大小(字符数)
|
||||
chunk_overlap: 分块重叠大小
|
||||
max_chunk_size: 最大分块大小(防止单个条款过长)
|
||||
min_chunk_size: 最小分块大小(防止碎片化)
|
||||
"""
|
||||
"""Initialize the Regulation Chunker instance."""
|
||||
self.chunk_size = chunk_size
|
||||
self.chunk_overlap = chunk_overlap
|
||||
self.max_chunk_size = max_chunk_size
|
||||
@@ -78,30 +65,18 @@ class RegulationChunker:
|
||||
regulation_type: str = "",
|
||||
version: str = ""
|
||||
) -> List[TextChunk]:
|
||||
"""
|
||||
对法规文档进行智能分块
|
||||
|
||||
Args:
|
||||
markdown_text: Markdown格式的文档内容
|
||||
doc_id: 文档ID
|
||||
doc_name: 文档名称
|
||||
regulation_type: 法规类型
|
||||
version: 文档版本
|
||||
|
||||
Returns:
|
||||
List[TextChunk]: 分块列表
|
||||
"""
|
||||
"""Handle chunk document for the Regulation Chunker instance."""
|
||||
logger.info(f"开始分块文档: {doc_name}")
|
||||
|
||||
# 1. 按章节分割(一级分块)
|
||||
# 1. Split by chapter/section (first-level chunks).
|
||||
sections = self._split_by_sections(markdown_text)
|
||||
|
||||
# 2. 在每个章节内按条款分割(二级分块)
|
||||
# 2. Within each section, split by clause (second-level chunks).
|
||||
chunks = []
|
||||
global_position = 0
|
||||
|
||||
for section_num, section_title, section_content, section_start in sections:
|
||||
# 在章节内按条款分割
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
clause_chunks = self._split_by_clauses(
|
||||
section_content,
|
||||
section_num,
|
||||
@@ -110,7 +85,7 @@ class RegulationChunker:
|
||||
)
|
||||
|
||||
for chunk_content, clause_num, clause_title, start_pos, end_pos in clause_chunks:
|
||||
# 处理过长的条款(进一步细分)
|
||||
# Subdivide clauses that exceed max_chunk_size.
|
||||
if len(chunk_content) > self.max_chunk_size:
|
||||
sub_chunks = self._split_long_clause(
|
||||
chunk_content,
|
||||
@@ -150,12 +125,7 @@ class RegulationChunker:
|
||||
return chunks
|
||||
|
||||
def _split_by_sections(self, markdown_text: str) -> List[Tuple[str, str, str, int]]:
|
||||
"""
|
||||
按章节分割文档
|
||||
|
||||
Returns:
|
||||
List of (section_number, section_title, section_content, start_position)
|
||||
"""
|
||||
"""Handle split by sections for this module for the Regulation Chunker instance."""
|
||||
sections = []
|
||||
lines = markdown_text.split('\n')
|
||||
|
||||
@@ -165,12 +135,12 @@ class RegulationChunker:
|
||||
current_section_start = 0
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
# 检测章节标题
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
chapter_match = self.CHAPTER_PATTERN.match(line.strip())
|
||||
section_match = self.SECTION_PATTERN.match(line.strip())
|
||||
|
||||
if chapter_match or section_match:
|
||||
# 保存上一个章节
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if current_section_content:
|
||||
content = '\n'.join(current_section_content)
|
||||
sections.append((
|
||||
@@ -180,7 +150,7 @@ class RegulationChunker:
|
||||
current_section_start
|
||||
))
|
||||
|
||||
# 开始新章节
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
current_section_start = sum(len(l) + 1 for l in lines[:i])
|
||||
current_section_content = []
|
||||
|
||||
@@ -193,7 +163,7 @@ class RegulationChunker:
|
||||
|
||||
current_section_content.append(line)
|
||||
|
||||
# 保存最后一个章节
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if current_section_content:
|
||||
content = '\n'.join(current_section_content)
|
||||
sections.append((
|
||||
@@ -203,7 +173,7 @@ class RegulationChunker:
|
||||
current_section_start
|
||||
))
|
||||
|
||||
# 如果没有检测到章节,将整个文档作为一个大章节
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if not sections:
|
||||
sections.append((
|
||||
"",
|
||||
@@ -221,12 +191,7 @@ class RegulationChunker:
|
||||
section_title: str,
|
||||
section_start: int
|
||||
) -> List[Tuple[str, str, str, int, int]]:
|
||||
"""
|
||||
在章节内按条款分割
|
||||
|
||||
Returns:
|
||||
List of (content, clause_number, clause_title, start_position, end_position)
|
||||
"""
|
||||
"""Handle split by clauses for this module for the Regulation Chunker instance."""
|
||||
clauses = []
|
||||
lines = section_content.split('\n')
|
||||
|
||||
@@ -236,11 +201,11 @@ class RegulationChunker:
|
||||
current_clause_start = section_start
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
# 检测条款标题
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
clause_match = self.CLAUSE_PATTERN.match(line.strip())
|
||||
|
||||
if clause_match:
|
||||
# 保存上一个条款
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if current_clause_content:
|
||||
content = '\n'.join(current_clause_content)
|
||||
end_pos = current_clause_start + len(content)
|
||||
@@ -252,7 +217,7 @@ class RegulationChunker:
|
||||
end_pos
|
||||
))
|
||||
|
||||
# 开始新条款
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
current_clause_start = section_start + sum(len(l) + 1 for l in lines[:i])
|
||||
current_clause_content = []
|
||||
current_clause_num = self._extract_clause_number(line.strip())
|
||||
@@ -260,7 +225,7 @@ class RegulationChunker:
|
||||
|
||||
current_clause_content.append(line)
|
||||
|
||||
# 保存最后一个条款
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if current_clause_content:
|
||||
content = '\n'.join(current_clause_content)
|
||||
end_pos = current_clause_start + len(content)
|
||||
@@ -272,7 +237,7 @@ class RegulationChunker:
|
||||
end_pos
|
||||
))
|
||||
|
||||
# 如果没有检测到条款,将整个章节作为一个条款
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if not clauses:
|
||||
clauses.append((
|
||||
section_content,
|
||||
@@ -290,15 +255,11 @@ class RegulationChunker:
|
||||
clause_num: str,
|
||||
clause_title: str
|
||||
) -> List[Tuple[str, int, int]]:
|
||||
"""
|
||||
分割过长的条款内容
|
||||
|
||||
按条款子项或段落分割,保持语义完整性
|
||||
"""
|
||||
"""Handle split long clause for this module for the Regulation Chunker instance."""
|
||||
sub_chunks = []
|
||||
lines = content.split('\n')
|
||||
|
||||
# 检测是否有子项结构
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
has_sub_items = any(
|
||||
self.SUB_ITEM_PATTERN.match(line.strip()) or
|
||||
self.NUMBER_ITEM_PATTERN.match(line.strip())
|
||||
@@ -306,7 +267,7 @@ class RegulationChunker:
|
||||
)
|
||||
|
||||
if has_sub_items:
|
||||
# 按子项分割
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
current_sub_content = []
|
||||
current_sub_start = 0
|
||||
|
||||
@@ -326,14 +287,14 @@ class RegulationChunker:
|
||||
|
||||
current_sub_content.append(line)
|
||||
|
||||
# 保存最后一个子项
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if current_sub_content:
|
||||
sub_content = '\n'.join(current_sub_content)
|
||||
sub_end = current_sub_start + len(sub_content)
|
||||
sub_chunks.append((sub_content, current_sub_start, sub_end))
|
||||
|
||||
else:
|
||||
# 按段落分割(滑动窗口)
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
paragraphs = []
|
||||
current_para = []
|
||||
|
||||
@@ -348,7 +309,7 @@ class RegulationChunker:
|
||||
if current_para:
|
||||
paragraphs.append('\n'.join(current_para))
|
||||
|
||||
# 合并段落直到达到chunk_size
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
current_chunk = []
|
||||
current_length = 0
|
||||
chunk_start = 0
|
||||
@@ -365,7 +326,7 @@ class RegulationChunker:
|
||||
current_chunk.append(para)
|
||||
current_length += len(para)
|
||||
|
||||
# 保存最后一个chunk
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if current_chunk:
|
||||
chunk_content = '\n'.join(current_chunk)
|
||||
chunk_end = chunk_start + len(chunk_content)
|
||||
@@ -374,13 +335,13 @@ class RegulationChunker:
|
||||
return sub_chunks
|
||||
|
||||
def _extract_title(self, header_line: str) -> str:
|
||||
"""从标题行提取标题内容"""
|
||||
# 移除"第X章"、"第X节"前缀
|
||||
"""Handle extract title for this module for the Regulation Chunker instance."""
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
title = re.sub(r'^第[一二三四五六七八九十百]+[章节]\s+', '', header_line)
|
||||
return title.strip()
|
||||
|
||||
def _extract_clause_number(self, clause_line: str) -> str:
|
||||
"""从条款行提取条款编号"""
|
||||
"""Handle extract clause number for this module for the Regulation Chunker instance."""
|
||||
match = self.CLAUSE_PATTERN.match(clause_line)
|
||||
if match:
|
||||
return match.group(0).strip()
|
||||
@@ -399,14 +360,14 @@ class RegulationChunker:
|
||||
regulation_type: str,
|
||||
version: str
|
||||
) -> TextChunk:
|
||||
"""创建文本分块"""
|
||||
# 清理内容
|
||||
"""Handle create chunk for this module for the Regulation Chunker instance."""
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
content = content.strip()
|
||||
|
||||
# 计算估算token数(中文约1.5字符/token)
|
||||
token_count = int(len(content) * 0.7) # 简化估算
|
||||
# Rough token-count estimate (simplified heuristic: 0.7 tokens per character).
|
||||
token_count = int(len(content) * 0.7) # Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
|
||||
# 生成chunk_id
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
chunk_id = f"{doc_id}_{section_num}_{clause_num}_{start_pos}"
|
||||
|
||||
metadata = ChunkMetadata(
|
||||
@@ -437,7 +398,7 @@ def chunk_regulation_document(
|
||||
version: str = "",
|
||||
chunk_size: int = 512
|
||||
) -> List[TextChunk]:
|
||||
"""便捷函数:对法规文档进行分块"""
|
||||
"""Handle chunk regulation document."""
|
||||
chunker = RegulationChunker(chunk_size=chunk_size)
|
||||
return chunker.chunk_document(
|
||||
markdown_text,
|
||||
|
||||
Reference in New Issue
Block a user