# src/services/embedding/bge_m3_embedder.py
"""BGE-M3 embedding service: dual-path (dense + sparse) vector generation."""

import os
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Union

import numpy as np
import torch
from loguru import logger

# Point the Hugging Face client at a mirror (for networks inside mainland
# China) unless the caller has already chosen an endpoint.
os.environ.setdefault('HF_ENDPOINT', 'https://hf-mirror.com')

# Candidate local model directories, checked in priority order:
#   1. the ModelScope download location
#   2. a Hugging Face local cache location
# NOTE(review): Hugging Face caches usually name snapshot directories after a
# commit hash rather than "main", so the second entry may never exist --
# confirm against a real cache.
LOCAL_MODEL_PATHS = [
    os.path.expanduser(candidate)
    for candidate in (
        "~/.cache/modelscope/Xorbits/bge-m3",
        "~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main",
    )
]


@dataclass
class EmbeddingResult:
    """Bundle of vectors produced by a single embedding call."""

    # Dense vectors (semantic retrieval).
    dense_embeddings: np.ndarray
    # Sparse vectors (keyword matching); presumably one mapping per input text.
    # NOTE(review): annotated with int keys -- confirm the key type the model
    # actually emits.
    sparse_embeddings: List[Dict[int, float]]
    # The texts that were embedded.
    texts: List[str]
    # Dense vector width; 1024 unless the caller supplies another value.
    dim: int = 1024


class BGEM3Embedder:
    """Embedding service built on the BGE-M3 multilingual model.

    BGE-M3 is a multilingual embedding model published by BAAI. It supports:

    - Dense vectors: semantic-similarity retrieval
    - Sparse vectors: exact keyword matching (BM25 style)
    - ColBERT vectors: fine-grained interaction matching (optional)

    Highlights:

    - 100+ languages (optimised for Chinese and English)
    - 8192-token long context
    - Combined dense + sparse retrieval

    GitHub: https://github.com/FlagOpen/FlagEmbedding
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-m3",
        use_fp16: bool = True,
        device: Optional[str] = None,
        batch_size: int = 12,
        max_length: int = 8192,
        local_model_path: Optional[str] = None
    ):
        """Configure the embedder and load the model immediately.

        Args:
            model_name: Remote model identifier. Ignored when a local model
                directory is found.
            use_fp16: Whether to ask the model for FP16 inference.
            device: Device string (``"cuda"`` / ``"cpu"``). When ``None``,
                CUDA is used if ``torch.cuda.is_available()``, else CPU.
            batch_size: Batch size passed to ``encode``.
            max_length: Maximum sequence length passed to ``encode``.
            local_model_path: Optional local model directory. Takes priority
                when it exists; otherwise ``LOCAL_MODEL_PATHS`` is searched
                and finally ``model_name`` is used.

        Raises:
            ImportError: If ``FlagEmbedding`` cannot be imported.
            Exception: Whatever the model constructor raises is re-raised.
        """
        self.use_fp16 = use_fp16
        self.batch_size = batch_size
        self.max_length = max_length

        self.model_path, self.model_name = self._resolve_model_path(
            model_name, local_model_path
        )

        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        logger.info(f"初始化BGE-M3模型, 设备: {self.device}")

        self.model = None
        self._load_model()

    @staticmethod
    def _resolve_model_path(model_name: str, local_model_path: Optional[str]):
        """Pick the model location; return ``(model_path, model_name)``.

        Priority: explicit ``local_model_path`` (if it exists), then the first
        ``LOCAL_MODEL_PATHS`` entry containing ``config.json``, then the
        remote ``model_name``. ``model_name`` is reported as ``"local"``
        whenever a local directory is chosen.
        """
        if local_model_path:
            if os.path.exists(local_model_path):
                logger.info(f"使用本地模型路径: {local_model_path}")
                return local_model_path, "local"
            # Make the fallback visible instead of silently ignoring a bad path.
            logger.warning(f"指定的本地模型路径不存在: {local_model_path},将尝试默认路径")

        for path in LOCAL_MODEL_PATHS:
            # config.json existing implies the directory itself exists.
            if os.path.exists(os.path.join(path, "config.json")):
                logger.info(f"使用本地模型路径: {path}")
                return path, "local"

        logger.info(f"使用远程模型: {model_name}")
        return model_name, model_name

    def _load_model(self):
        """Import FlagEmbedding lazily and build the model into ``self.model``.

        Raises:
            ImportError: If ``FlagEmbedding`` cannot be imported.
            Exception: Any error from constructing the model (logged, then
                re-raised unchanged).
        """
        # Keep the import in its own try so only a genuine import failure
        # yields the "please install" hint.
        try:
            from FlagEmbedding import BGEM3FlagModel
        except ImportError:
            logger.warning("FlagEmbedding库未安装,请运行: pip install FlagEmbedding")
            raise

        try:
            self.model = BGEM3FlagModel(
                self.model_path,
                use_fp16=self.use_fp16,
                device=self.device
            )
        except Exception as e:
            logger.error(f"模型加载失败: {e}")
            raise

        logger.success("BGE-M3模型加载成功")

    @staticmethod
    def _unpack_encode_output(embeddings: Dict):
        """Extract ``(dense, sparse, dim)`` from the dict returned by ``encode``.

        A missing or ``None`` ``dense_vecs`` becomes an empty array and a
        missing or ``None`` ``lexical_weights`` becomes an empty list.
        ``dim`` is the second axis of a non-empty 2-D dense array, else 1024.
        """
        # NOTE(review): some FlagEmbedding releases appear to return None for
        # outputs that were not requested rather than omitting the key --
        # handle both so sparse-only calls do not crash.
        dense = embeddings.get('dense_vecs')
        if dense is None:
            dense = np.array([])

        sparse = embeddings.get('lexical_weights')
        if sparse is None:
            sparse = []

        has_rows = getattr(dense, "ndim", 0) >= 2 and len(dense) > 0
        dim = dense.shape[1] if has_rows else 1024
        return dense, sparse, dim

    def embed(
        self,
        texts: List[str],
        return_dense: bool = True,
        return_sparse: bool = True,
        return_colbert_vecs: bool = False
    ) -> "EmbeddingResult":
        """Embed a list of texts.

        Args:
            texts: Texts to embed.
            return_dense: Whether to request dense vectors.
            return_sparse: Whether to request sparse vectors.
            return_colbert_vecs: Whether to request ColBERT vectors (they are
                requested from the model but not stored in the result).

        Returns:
            EmbeddingResult: For empty ``texts`` an empty result with
            ``dim=0``; otherwise the extracted dense/sparse vectors.

        Raises:
            Exception: Any error from the model is logged and re-raised.
        """
        if not texts:
            logger.warning("输入文本列表为空")
            return EmbeddingResult(
                dense_embeddings=np.array([]),
                sparse_embeddings=[],
                texts=[],
                dim=0
            )

        logger.info(f"开始嵌入{len(texts)}个文本块")

        try:
            embeddings = self.model.encode(
                texts,
                batch_size=self.batch_size,
                max_length=self.max_length,
                return_dense=return_dense,
                return_sparse=return_sparse,
                return_colbert_vecs=return_colbert_vecs
            )

            dense_embeddings, sparse_embeddings, dim = self._unpack_encode_output(embeddings)

            logger.success(f"嵌入完成,向量维度: {dim}")

            return EmbeddingResult(
                dense_embeddings=dense_embeddings,
                sparse_embeddings=sparse_embeddings,
                texts=texts,
                dim=dim
            )

        except Exception as e:
            logger.error(f"嵌入失败: {e}")
            raise

    def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
        """Embed one text.

        Args:
            text: Input text.

        Returns:
            Dict with keys ``'dense'`` (first dense vector), ``'sparse'``
            (first sparse mapping, or ``{}`` when none was produced) and
            ``'dim'``.
        """
        result = self.embed([text])
        sparse = result.sparse_embeddings[0] if result.sparse_embeddings else {}
        return {
            'dense': result.dense_embeddings[0],
            'sparse': sparse,
            'dim': result.dim
        }

    def embed_dense(self, texts: List[str]) -> np.ndarray:
        """Return only the dense vectors for ``texts``."""
        result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
        return result.dense_embeddings

    def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
        """Return only the sparse vectors for ``texts``."""
        result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
        return result.sparse_embeddings

    def embed_query(self, query: str) -> Dict:
        """Embed a retrieval query.

        Args:
            query: Query text.

        Returns:
            Dict: Same structure as :meth:`embed_single`.
        """
        return self.embed_single(query)

    def compute_similarity(
        self,
        query_embedding: np.ndarray,
        doc_embeddings: np.ndarray,
        metric: str = "cosine"
    ) -> np.ndarray:
        """Score documents against a query vector.

        Args:
            query_embedding: Query vector.
            doc_embeddings: Document matrix, one row per document.
            metric: ``"cosine"`` or ``"dot"``.

        Returns:
            np.ndarray: One score per document. With ``"cosine"``, any pair
            involving a zero-norm vector scores 0 instead of NaN/inf.

        Raises:
            ValueError: If ``metric`` is not supported.
        """
        if metric == "cosine":
            query_norm = np.linalg.norm(query_embedding)
            doc_norms = np.linalg.norm(doc_embeddings, axis=1)
            denom = doc_norms * query_norm

            # Divide quietly, then zero out entries whose denominator is not
            # strictly positive (zero-norm query or document).
            with np.errstate(divide="ignore", invalid="ignore"):
                similarities = np.asarray(np.dot(doc_embeddings, query_embedding) / denom)
            similarities[~(denom > 0)] = 0

        elif metric == "dot":
            similarities = np.dot(doc_embeddings, query_embedding)

        else:
            raise ValueError(f"不支持的相似度度量: {metric}")

        return similarities

    def sparse_similarity(
        self,
        query_sparse: Dict[int, float],
        doc_sparse: Dict[int, float]
    ) -> float:
        """Score two sparse vectors (BM25 style).

        Args:
            query_sparse: Query sparse vector (token id -> weight).
            doc_sparse: Document sparse vector.

        Returns:
            float: Sum of weight products over the keys present in both
            mappings (0 when they share none).
        """
        common_keys = query_sparse.keys() & doc_sparse.keys()
        return sum(query_sparse[k] * doc_sparse[k] for k in common_keys)


def embed_texts(
    texts: List[str],
    model_name: str = "BAAI/bge-m3",
    **kwargs
) -> EmbeddingResult:
    """Convenience wrapper: embed a list of texts.

    A fresh ``BGEM3Embedder`` is built on every call (its constructor loads
    the model), with ``kwargs`` forwarded to that constructor.
    """
    return BGEM3Embedder(model_name=model_name, **kwargs).embed(texts)


def embed_single_text(
    text: str,
    model_name: str = "BAAI/bge-m3",
    **kwargs
) -> Dict:
    """Convenience wrapper: embed a single text.

    A fresh ``BGEM3Embedder`` is built on every call (its constructor loads
    the model), with ``kwargs`` forwarded to that constructor.
    """
    return BGEM3Embedder(model_name=model_name, **kwargs).embed_single(text)