Files
AIRegulation-DocAnalysis/backend/app/services/embedding/bge_m3_embedder.py

296 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""BGE-M3嵌入服务 - Dense+Sparse双路向量生成"""
import numpy as np
from typing import List, Dict, Optional, Union
from dataclasses import dataclass, field
from loguru import logger
import torch
import os
# Point HuggingFace downloads at a mirror (for networks in mainland China),
# unless the caller has already chosen an endpoint.
os.environ.setdefault('HF_ENDPOINT', 'https://hf-mirror.com')

# Candidate local model directories, checked in priority order:
#   1. ModelScope download location
#   2. HuggingFace local cache location
LOCAL_MODEL_PATHS = [
    os.path.expanduser(candidate)
    for candidate in (
        "~/.cache/modelscope/Xorbits/bge-m3",
        "~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main",
    )
]
@dataclass
class EmbeddingResult:
    """Container for the output of one embedding call.

    The field order below is also the positional constructor order.
    """

    # Dense vectors, used for semantic retrieval.
    dense_embeddings: np.ndarray
    # Sparse vectors (one mapping per text), used for keyword matching.
    sparse_embeddings: List[Dict[int, float]]
    # Texts carried alongside the vectors.
    texts: List[str]
    # Dense vector dimensionality; defaults to 1024.
    dim: int = 1024
class BGEM3Embedder:
    """Embedding service built on the multilingual BGE-M3 model.

    BGE-M3 is a multilingual embedding model released by BAAI. It supports:
    - Dense vectors for semantic-similarity retrieval
    - Sparse vectors for exact keyword matching (BM25 style)
    - ColBERT vectors for fine-grained interaction matching (optional)

    Features:
    - 100+ languages (optimised for Chinese/English)
    - Very long context (8192 tokens)
    - Dense + sparse dual-path retrieval

    GitHub: https://github.com/FlagOpen/FlagEmbedding
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-m3",
        use_fp16: bool = True,
        device: Optional[str] = None,
        batch_size: int = 12,
        max_length: int = 8192,
        local_model_path: Optional[str] = None
    ):
        """Initialise the embedder and load the model immediately.

        Args:
            model_name: Remote model name; ignored when a local model
                directory is found.
            use_fp16: Whether to use FP16 for acceleration.
            device: Device string ("cuda"/"cpu"); chosen automatically
                when None.
            batch_size: Batch size passed to the model's ``encode``.
            max_length: Maximum sequence length passed to ``encode``.
            local_model_path: Optional local model directory; takes
                priority when it exists.

        Raises:
            ImportError: If FlagEmbedding is not installed.
            Exception: Any other error raised while loading the model.
        """
        self.use_fp16 = use_fp16
        self.batch_size = batch_size
        self.max_length = max_length

        # Prefer a local copy of the model; fall back to the remote name.
        local_dir = self._find_local_model(local_model_path)
        if local_dir is not None:
            self.model_path = local_dir
            self.model_name = "local"
            logger.info(f"使用本地模型路径: {local_dir}")
        else:
            self.model_path = model_name
            self.model_name = model_name
            logger.info(f"使用远程模型: {model_name}")

        # Pick the device automatically unless the caller specified one.
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        logger.info(f"初始化BGE-M3模型, 设备: {self.device}")

        self.model = None
        self._load_model()

    @staticmethod
    def _find_local_model(local_model_path: Optional[str]) -> Optional[str]:
        """Return the first usable local model directory, or None.

        The explicit ``local_model_path`` wins if it exists; otherwise each
        entry of ``LOCAL_MODEL_PATHS`` is accepted only when it contains a
        ``config.json``.
        """
        if local_model_path and os.path.exists(local_model_path):
            return local_model_path
        for candidate in LOCAL_MODEL_PATHS:
            if os.path.exists(candidate) and os.path.exists(
                os.path.join(candidate, "config.json")
            ):
                return candidate
        return None

    def _load_model(self):
        """Load the BGE-M3 model into ``self.model``.

        Raises:
            ImportError: If FlagEmbedding is not installed.
            Exception: Any other loading failure (logged, then re-raised).
        """
        try:
            # Imported lazily so the module can be imported without the library.
            from FlagEmbedding import BGEM3FlagModel
            self.model = BGEM3FlagModel(
                self.model_path,
                use_fp16=self.use_fp16,
                device=self.device
            )
            logger.success("BGE-M3模型加载成功")
        except ImportError:
            logger.warning("FlagEmbedding库未安装请运行: pip install FlagEmbedding")
            raise
        except Exception as e:
            logger.error(f"模型加载失败: {e}")
            raise

    def embed(
        self,
        texts: List[str],
        return_dense: bool = True,
        return_sparse: bool = True,
        return_colbert_vecs: bool = False
    ) -> "EmbeddingResult":
        """Generate embeddings for a list of texts.

        Args:
            texts: Texts to embed.
            return_dense: Whether to request dense vectors.
            return_sparse: Whether to request sparse vectors.
            return_colbert_vecs: Whether to request ColBERT vectors. They are
                requested from the model but not stored in the result.

        Returns:
            EmbeddingResult: Dense/sparse vectors plus the input texts. For an
            empty input the result is empty with ``dim=0``. When no dense
            vectors are produced, ``dense_embeddings`` is an empty array and
            ``dim`` falls back to 1024.

        Raises:
            Exception: Any error raised by the model (logged, then re-raised).
        """
        if not texts:
            logger.warning("输入文本列表为空")
            return EmbeddingResult(
                dense_embeddings=np.array([]),
                sparse_embeddings=[],
                texts=[],
                dim=0
            )
        logger.info(f"开始嵌入{len(texts)}个文本块")
        try:
            embeddings = self.model.encode(
                texts,
                batch_size=self.batch_size,
                max_length=self.max_length,
                return_dense=return_dense,
                return_sparse=return_sparse,
                return_colbert_vecs=return_colbert_vecs
            )

            # The model may return a key with a None value for outputs that
            # were not requested, so a ``.get`` default alone is not enough.
            dense_embeddings = embeddings.get('dense_vecs')
            if dense_embeddings is None:
                dense_embeddings = np.array([])
            else:
                dense_embeddings = np.asarray(dense_embeddings)

            sparse_embeddings = embeddings.get('lexical_weights')
            if sparse_embeddings is None:
                sparse_embeddings = []

            # Last axis is the vector dimension; default when nothing dense.
            dim = dense_embeddings.shape[-1] if dense_embeddings.size > 0 else 1024
            logger.success(f"嵌入完成,向量维度: {dim}")
            return EmbeddingResult(
                dense_embeddings=dense_embeddings,
                sparse_embeddings=sparse_embeddings,
                texts=texts,
                dim=dim
            )
        except Exception as e:
            logger.error(f"嵌入失败: {e}")
            raise

    def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
        """Generate embeddings for a single text.

        Args:
            text: Input text.

        Returns:
            Dict with keys ``'dense'`` (first dense vector), ``'sparse'``
            (first sparse mapping, or ``{}`` if none) and ``'dim'``.
        """
        result = self.embed([text])
        return {
            'dense': result.dense_embeddings[0],
            'sparse': result.sparse_embeddings[0] if result.sparse_embeddings else {},
            'dim': result.dim
        }

    def embed_dense(self, texts: List[str]) -> np.ndarray:
        """Generate only dense vectors for ``texts``."""
        result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
        return result.dense_embeddings

    def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
        """Generate only sparse vectors for ``texts``."""
        result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
        return result.sparse_embeddings

    def embed_query(self, query: str) -> Dict:
        """Generate embeddings for a query text (used for retrieval).

        Args:
            query: Query text.

        Returns:
            Dict: Same structure as :meth:`embed_single`.
        """
        return self.embed_single(query)

    def compute_similarity(
        self,
        query_embedding: np.ndarray,
        doc_embeddings: np.ndarray,
        metric: str = "cosine"
    ) -> np.ndarray:
        """Compute similarity between one query vector and document vectors.

        Args:
            query_embedding: Query vector.
            doc_embeddings: Document matrix, one vector per row.
            metric: ``"cosine"`` or ``"dot"``.

        Returns:
            np.ndarray: One score per document row. With ``"cosine"``, rows
            (or a query) with zero norm score 0 instead of NaN/inf.

        Raises:
            ValueError: If ``metric`` is not supported.
        """
        if metric == "cosine":
            dots = np.dot(doc_embeddings, query_embedding)
            query_norm = np.linalg.norm(query_embedding)
            doc_norms = np.linalg.norm(doc_embeddings, axis=1)
            denom = doc_norms * query_norm
            # Divide only where the denominator is non-zero; other entries
            # keep the 0 they were initialised with.
            scores = np.zeros(dots.shape, dtype=np.result_type(dots, denom))
            similarities = np.divide(dots, denom, out=scores, where=denom != 0)
        elif metric == "dot":
            similarities = np.dot(doc_embeddings, query_embedding)
        else:
            raise ValueError(f"不支持的相似度度量: {metric}")
        return similarities

    def sparse_similarity(
        self,
        query_sparse: Dict[int, float],
        doc_sparse: Dict[int, float]
    ) -> float:
        """Compute the similarity of two sparse vectors (BM25 style).

        Args:
            query_sparse: Query sparse vector (token id -> weight).
            doc_sparse: Document sparse vector.

        Returns:
            float: Dot product over the keys present in both mappings
            (0 when they share no key).
        """
        shared_keys = query_sparse.keys() & doc_sparse.keys()
        return sum(query_sparse[k] * doc_sparse[k] for k in shared_keys)
def embed_texts(
    texts: List[str],
    model_name: str = "BAAI/bge-m3",
    **kwargs
) -> EmbeddingResult:
    """Convenience function: embed a list of texts.

    A new ``BGEM3Embedder`` is constructed on every call; ``kwargs`` are
    forwarded to its constructor.
    """
    return BGEM3Embedder(model_name=model_name, **kwargs).embed(texts)
def embed_single_text(
    text: str,
    model_name: str = "BAAI/bge-m3",
    **kwargs
) -> Dict:
    """Convenience function: embed a single text.

    A new ``BGEM3Embedder`` is constructed on every call; ``kwargs`` are
    forwarded to its constructor.
    """
    return BGEM3Embedder(model_name=model_name, **kwargs).embed_single(text)