This commit is contained in:
2026-05-14 15:07:34 +08:00
parent c2a398930d
commit 10d04c4083
179 changed files with 24073 additions and 1243 deletions

View File

@@ -0,0 +1,7 @@
# src/services/embedding/__init__.py
"""嵌入和分块服务"""
from .text_chunker import RegulationChunker
from .bge_m3_embedder import BGEM3Embedder
__all__ = ["RegulationChunker", "BGEM3Embedder"]

View File

@@ -0,0 +1,296 @@
# src/services/embedding/bge_m3_embedder.py
"""BGE-M3嵌入服务 - Dense+Sparse双路向量生成"""
import numpy as np
from typing import List, Dict, Optional, Union
from dataclasses import dataclass, field
from loguru import logger
import torch
import os
# 设置HuggingFace镜像国内网络
if 'HF_ENDPOINT' not in os.environ:
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# 本地模型路径(按优先级检查)
LOCAL_MODEL_PATHS = [
os.path.expanduser("~/.cache/modelscope/Xorbits/bge-m3"), # ModelScope下载路径
os.path.expanduser("~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main"), # HuggingFace本地路径
]
@dataclass
class EmbeddingResult:
"""嵌入结果"""
dense_embeddings: np.ndarray # Dense向量语义检索
sparse_embeddings: List[Dict[int, float]] # Sparse向量关键词匹配
texts: List[str]
dim: int = 1024
class BGEM3Embedder:
"""
BGE-M3多语言嵌入模型服务
BGE-M3是BAAI发布的多语言嵌入模型支持
- Dense向量用于语义相似度检索
- Sparse向量用于关键词精确匹配BM25风格
- ColBERT向量用于细粒度交互匹配可选
特点:
- 支持100+语言(中英双语优化)
- 8192 tokens超长上下文
- Dense+Sparse双路检索能力
GitHub: https://github.com/FlagOpen/FlagEmbedding
"""
def __init__(
self,
model_name: str = "BAAI/bge-m3",
use_fp16: bool = True,
device: Optional[str] = None,
batch_size: int = 12,
max_length: int = 8192,
local_model_path: Optional[str] = None
):
"""
初始化BGE-M3嵌入模型
Args:
model_name: 模型名称(如果使用本地路径,此参数会被忽略)
use_fp16: 是否使用FP16加速
device: 设备类型cuda/cpu默认自动选择
batch_size: 批处理大小
max_length: 最大序列长度
local_model_path: 本地模型路径(可选,优先使用)
"""
self.use_fp16 = use_fp16
self.batch_size = batch_size
self.max_length = max_length
# 确定模型路径(优先使用本地路径)
if local_model_path and os.path.exists(local_model_path):
self.model_path = local_model_path
self.model_name = "local"
logger.info(f"使用本地模型路径: {local_model_path}")
else:
# 检查多个可能的本地路径
found_local = False
for path in LOCAL_MODEL_PATHS:
if os.path.exists(path) and os.path.exists(os.path.join(path, "config.json")):
self.model_path = path
self.model_name = "local"
logger.info(f"使用本地模型路径: {path}")
found_local = True
break
if not found_local:
self.model_path = model_name
self.model_name = model_name
logger.info(f"使用远程模型: {model_name}")
# 自动选择设备
if device is None:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
else:
self.device = device
logger.info(f"初始化BGE-M3模型, 设备: {self.device}")
self.model = None
self._load_model()
def _load_model(self):
"""加载嵌入模型"""
try:
from FlagEmbedding import BGEM3FlagModel
self.model = BGEM3FlagModel(
self.model_path,
use_fp16=self.use_fp16,
device=self.device
)
logger.success(f"BGE-M3模型加载成功")
except ImportError:
logger.warning("FlagEmbedding库未安装请运行: pip install FlagEmbedding")
raise
except Exception as e:
logger.error(f"模型加载失败: {e}")
raise
def embed(
self,
texts: List[str],
return_dense: bool = True,
return_sparse: bool = True,
return_colbert_vecs: bool = False
) -> EmbeddingResult:
"""
对文本列表生成嵌入向量
Args:
texts: 文本列表
return_dense: 是否返回Dense向量
return_sparse: 是否返回Sparse向量
return_colbert_vecs: 是否返回ColBERT向量
Returns:
EmbeddingResult: 嵌入结果
"""
if not texts:
logger.warning("输入文本列表为空")
return EmbeddingResult(
dense_embeddings=np.array([]),
sparse_embeddings=[],
texts=[],
dim=0
)
logger.info(f"开始嵌入{len(texts)}个文本块")
try:
# 执行嵌入
embeddings = self.model.encode(
texts,
batch_size=self.batch_size,
max_length=self.max_length,
return_dense=return_dense,
return_sparse=return_sparse,
return_colbert_vecs=return_colbert_vecs
)
# 提取结果
dense_embeddings = embeddings.get('dense_vecs', np.array([]))
sparse_embeddings = embeddings.get('lexical_weights', [])
# 获取维度
dim = dense_embeddings.shape[1] if len(dense_embeddings) > 0 else 1024
logger.success(f"嵌入完成,向量维度: {dim}")
return EmbeddingResult(
dense_embeddings=dense_embeddings,
sparse_embeddings=sparse_embeddings,
texts=texts,
dim=dim
)
except Exception as e:
logger.error(f"嵌入失败: {e}")
raise
def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
"""
对单个文本生成嵌入向量
Args:
text: 输入文本
Returns:
Dict: 包含dense和sparse向量
"""
result = self.embed([text])
return {
'dense': result.dense_embeddings[0],
'sparse': result.sparse_embeddings[0] if result.sparse_embeddings else {},
'dim': result.dim
}
def embed_dense(self, texts: List[str]) -> np.ndarray:
"""只生成Dense向量"""
result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
return result.dense_embeddings
def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
"""只生成Sparse向量"""
result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
return result.sparse_embeddings
def embed_query(self, query: str) -> Dict:
"""
对查询文本生成嵌入(用于检索)
Args:
query: 查询文本
Returns:
Dict: 包含dense和sparse向量
"""
return self.embed_single(query)
def compute_similarity(
self,
query_embedding: np.ndarray,
doc_embeddings: np.ndarray,
metric: str = "cosine"
) -> np.ndarray:
"""
计算查询与文档的相似度
Args:
query_embedding: 查询向量
doc_embeddings: 文档向量矩阵
metric: 相似度度量cosine/dot
Returns:
np.ndarray: 相似度分数数组
"""
if metric == "cosine":
# 余弦相似度
query_norm = np.linalg.norm(query_embedding)
doc_norms = np.linalg.norm(doc_embeddings, axis=1)
similarities = np.dot(doc_embeddings, query_embedding) / (doc_norms * query_norm)
elif metric == "dot":
# 点积相似度
similarities = np.dot(doc_embeddings, query_embedding)
else:
raise ValueError(f"不支持的相似度度量: {metric}")
return similarities
def sparse_similarity(
self,
query_sparse: Dict[int, float],
doc_sparse: Dict[int, float]
) -> float:
"""
计算Sparse向量的相似度BM25风格
Args:
query_sparse: 查询的Sparse向量词ID -> 权重)
doc_sparse: 文档的Sparse向量
Returns:
float: 相似度分数
"""
# 计算交集词的点积
common_keys = set(query_sparse.keys()) & set(doc_sparse.keys())
score = sum(query_sparse[k] * doc_sparse[k] for k in common_keys)
return score
def embed_texts(
texts: List[str],
model_name: str = "BAAI/bge-m3",
**kwargs
) -> EmbeddingResult:
"""便捷函数:对文本列表生成嵌入"""
embedder = BGEM3Embedder(model_name=model_name, **kwargs)
return embedder.embed(texts)
def embed_single_text(
text: str,
model_name: str = "BAAI/bge-m3",
**kwargs
) -> Dict:
"""便捷函数:对单个文本生成嵌入"""
embedder = BGEM3Embedder(model_name=model_name, **kwargs)
return embedder.embed_single(text)

View File

@@ -0,0 +1,449 @@
# src/services/embedding/text_chunker.py
"""智能分块器 - 章节级+条款级双粒度切割"""
import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, field
from loguru import logger
@dataclass
class ChunkMetadata:
"""分块元数据"""
doc_id: str = ""
doc_name: str = ""
chunk_id: str = ""
section_number: str = "" # 章节编号(如 "第一章"
section_title: str = "" # 章节标题
clause_number: str = "" # 条款编号(如 "第一条"
page_number: int = 0
start_position: int = 0 # 在原文中的起始位置
end_position: int = 0 # 在原文中的结束位置
regulation_type: str = "" # 法规类型
version: str = ""
@dataclass
class TextChunk:
"""文本分块"""
content: str
metadata: ChunkMetadata
token_count: int = 0 # 估算的token数量
class RegulationChunker:
"""
法规文档智能分块器
实现章节级/条款级双粒度切割适配国标GB文档结构
- 国标文档通常有明确的层级结构:章 > 节 > 条
- 每个条款应作为一个独立的语义单元
- 保留条款完整性,避免跨条款截断
"""
# 法规标题模式
CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s+[^\n]+')
SECTION_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+节\s+[^\n]+')
CLAUSE_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+条\s')
# 条款子项模式
SUB_ITEM_PATTERN = re.compile(r'^[\(][一二三四五六七八九十]+[\)]\s')
NUMBER_ITEM_PATTERN = re.compile(r'^[\d]+[\.、]\s')
def __init__(
self,
chunk_size: int = 512,
chunk_overlap: int = 50,
max_chunk_size: int = 2048,
min_chunk_size: int = 100
):
"""
初始化分块器
Args:
chunk_size: 默认分块大小(字符数)
chunk_overlap: 分块重叠大小
max_chunk_size: 最大分块大小(防止单个条款过长)
min_chunk_size: 最小分块大小(防止碎片化)
"""
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.max_chunk_size = max_chunk_size
self.min_chunk_size = min_chunk_size
def chunk_document(
self,
markdown_text: str,
doc_id: str = "",
doc_name: str = "",
regulation_type: str = "",
version: str = ""
) -> List[TextChunk]:
"""
对法规文档进行智能分块
Args:
markdown_text: Markdown格式的文档内容
doc_id: 文档ID
doc_name: 文档名称
regulation_type: 法规类型
version: 文档版本
Returns:
List[TextChunk]: 分块列表
"""
logger.info(f"开始分块文档: {doc_name}")
# 1. 按章节分割(一级分块)
sections = self._split_by_sections(markdown_text)
# 2. 在每个章节内按条款分割(二级分块)
chunks = []
global_position = 0
for section_num, section_title, section_content, section_start in sections:
# 在章节内按条款分割
clause_chunks = self._split_by_clauses(
section_content,
section_num,
section_title,
section_start + global_position
)
for chunk_content, clause_num, clause_title, start_pos, end_pos in clause_chunks:
# 处理过长的条款(进一步细分)
if len(chunk_content) > self.max_chunk_size:
sub_chunks = self._split_long_clause(
chunk_content,
clause_num,
clause_title
)
for sub_content, sub_start, sub_end in sub_chunks:
chunk = self._create_chunk(
sub_content,
doc_id,
doc_name,
section_num,
section_title,
clause_num,
sub_start + start_pos,
sub_end + start_pos,
regulation_type,
version
)
chunks.append(chunk)
else:
chunk = self._create_chunk(
chunk_content,
doc_id,
doc_name,
section_num,
section_title,
clause_num,
start_pos,
end_pos,
regulation_type,
version
)
chunks.append(chunk)
logger.success(f"分块完成,共{len(chunks)}个chunk")
return chunks
def _split_by_sections(self, markdown_text: str) -> List[Tuple[str, str, str, int]]:
"""
按章节分割文档
Returns:
List of (section_number, section_title, section_content, start_position)
"""
sections = []
lines = markdown_text.split('\n')
current_section_num = ""
current_section_title = ""
current_section_content = []
current_section_start = 0
for i, line in enumerate(lines):
# 检测章节标题
chapter_match = self.CHAPTER_PATTERN.match(line.strip())
section_match = self.SECTION_PATTERN.match(line.strip())
if chapter_match or section_match:
# 保存上一个章节
if current_section_content:
content = '\n'.join(current_section_content)
sections.append((
current_section_num,
current_section_title,
content,
current_section_start
))
# 开始新章节
current_section_start = sum(len(l) + 1 for l in lines[:i])
current_section_content = []
if chapter_match:
current_section_num = line.strip()
current_section_title = self._extract_title(line.strip())
else:
current_section_num = line.strip()
current_section_title = self._extract_title(line.strip())
current_section_content.append(line)
# 保存最后一个章节
if current_section_content:
content = '\n'.join(current_section_content)
sections.append((
current_section_num,
current_section_title,
content,
current_section_start
))
# 如果没有检测到章节,将整个文档作为一个大章节
if not sections:
sections.append((
"",
"全文",
markdown_text,
0
))
return sections
def _split_by_clauses(
self,
section_content: str,
section_num: str,
section_title: str,
section_start: int
) -> List[Tuple[str, str, str, int, int]]:
"""
在章节内按条款分割
Returns:
List of (content, clause_number, clause_title, start_position, end_position)
"""
clauses = []
lines = section_content.split('\n')
current_clause_num = ""
current_clause_title = ""
current_clause_content = []
current_clause_start = section_start
for i, line in enumerate(lines):
# 检测条款标题
clause_match = self.CLAUSE_PATTERN.match(line.strip())
if clause_match:
# 保存上一个条款
if current_clause_content:
content = '\n'.join(current_clause_content)
end_pos = current_clause_start + len(content)
clauses.append((
content,
current_clause_num,
current_clause_title,
current_clause_start,
end_pos
))
# 开始新条款
current_clause_start = section_start + sum(len(l) + 1 for l in lines[:i])
current_clause_content = []
current_clause_num = self._extract_clause_number(line.strip())
current_clause_title = line.strip()
current_clause_content.append(line)
# 保存最后一个条款
if current_clause_content:
content = '\n'.join(current_clause_content)
end_pos = current_clause_start + len(content)
clauses.append((
content,
current_clause_num,
current_clause_title,
current_clause_start,
end_pos
))
# 如果没有检测到条款,将整个章节作为一个条款
if not clauses:
clauses.append((
section_content,
"",
section_title,
section_start,
section_start + len(section_content)
))
return clauses
def _split_long_clause(
self,
content: str,
clause_num: str,
clause_title: str
) -> List[Tuple[str, int, int]]:
"""
分割过长的条款内容
按条款子项或段落分割,保持语义完整性
"""
sub_chunks = []
lines = content.split('\n')
# 检测是否有子项结构
has_sub_items = any(
self.SUB_ITEM_PATTERN.match(line.strip()) or
self.NUMBER_ITEM_PATTERN.match(line.strip())
for line in lines
)
if has_sub_items:
# 按子项分割
current_sub_content = []
current_sub_start = 0
for i, line in enumerate(lines):
is_sub_item = (
self.SUB_ITEM_PATTERN.match(line.strip()) or
self.NUMBER_ITEM_PATTERN.match(line.strip())
)
if is_sub_item and current_sub_content:
sub_content = '\n'.join(current_sub_content)
sub_end = current_sub_start + len(sub_content)
if len(sub_content) >= self.min_chunk_size:
sub_chunks.append((sub_content, current_sub_start, sub_end))
current_sub_content = []
current_sub_start = sum(len(l) + 1 for l in lines[:i])
current_sub_content.append(line)
# 保存最后一个子项
if current_sub_content:
sub_content = '\n'.join(current_sub_content)
sub_end = current_sub_start + len(sub_content)
sub_chunks.append((sub_content, current_sub_start, sub_end))
else:
# 按段落分割(滑动窗口)
paragraphs = []
current_para = []
for line in lines:
if line.strip():
current_para.append(line)
else:
if current_para:
paragraphs.append('\n'.join(current_para))
current_para = []
if current_para:
paragraphs.append('\n'.join(current_para))
# 合并段落直到达到chunk_size
current_chunk = []
current_length = 0
chunk_start = 0
for para in paragraphs:
if current_length + len(para) > self.chunk_size and current_chunk:
chunk_content = '\n'.join(current_chunk)
chunk_end = chunk_start + len(chunk_content)
sub_chunks.append((chunk_content, chunk_start, chunk_end))
current_chunk = []
current_length = 0
chunk_start = chunk_end
current_chunk.append(para)
current_length += len(para)
# 保存最后一个chunk
if current_chunk:
chunk_content = '\n'.join(current_chunk)
chunk_end = chunk_start + len(chunk_content)
sub_chunks.append((chunk_content, chunk_start, chunk_end))
return sub_chunks
def _extract_title(self, header_line: str) -> str:
"""从标题行提取标题内容"""
# 移除"第X章"、"第X节"前缀
title = re.sub(r'^第[一二三四五六七八九十百]+[章节]\s+', '', header_line)
return title.strip()
def _extract_clause_number(self, clause_line: str) -> str:
"""从条款行提取条款编号"""
match = self.CLAUSE_PATTERN.match(clause_line)
if match:
return match.group(0).strip()
return ""
def _create_chunk(
self,
content: str,
doc_id: str,
doc_name: str,
section_num: str,
section_title: str,
clause_num: str,
start_pos: int,
end_pos: int,
regulation_type: str,
version: str
) -> TextChunk:
"""创建文本分块"""
# 清理内容
content = content.strip()
# 计算估算token数中文约1.5字符/token
token_count = int(len(content) * 0.7) # 简化估算
# 生成chunk_id
chunk_id = f"{doc_id}_{section_num}_{clause_num}_{start_pos}"
metadata = ChunkMetadata(
doc_id=doc_id,
doc_name=doc_name,
chunk_id=chunk_id,
section_number=section_num,
section_title=section_title,
clause_number=clause_num,
start_position=start_pos,
end_position=end_pos,
regulation_type=regulation_type,
version=version
)
return TextChunk(
content=content,
metadata=metadata,
token_count=token_count
)
def chunk_regulation_document(
markdown_text: str,
doc_id: str = "",
doc_name: str = "",
regulation_type: str = "",
version: str = "",
chunk_size: int = 512
) -> List[TextChunk]:
"""便捷函数:对法规文档进行分块"""
chunker = RegulationChunker(chunk_size=chunk_size)
return chunker.chunk_document(
markdown_text,
doc_id,
doc_name,
regulation_type,
version
)