"""BGE-M3 embedding service: dense + sparse dual-path vector generation."""

import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union

import numpy as np
import torch
from loguru import logger

# Default to the HuggingFace mirror (for networks inside mainland China)
# unless the caller has already chosen an endpoint.
os.environ.setdefault('HF_ENDPOINT', 'https://hf-mirror.com')

# Local model locations, probed in priority order.
LOCAL_MODEL_PATHS = [
    os.path.expanduser("~/.cache/modelscope/Xorbits/bge-m3"),  # ModelScope download path
    os.path.expanduser("~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main"),  # HuggingFace local path
]


@dataclass
class EmbeddingResult:
    """Container for the output of one embedding call."""

    dense_embeddings: np.ndarray  # dense vectors (semantic retrieval)
    # Sparse vectors (keyword matching), one mapping per input text.
    # NOTE(review): FlagEmbedding appears to key these by *stringified* token
    # id rather than int -- confirm against the installed version.
    sparse_embeddings: List[Dict[int, float]]
    texts: List[str]  # the input texts, in the order they were embedded
    dim: int = 1024  # dense vector dimensionality


class BGEM3Embedder:
    """
    Embedding service built on the multilingual BGE-M3 model.

    BGE-M3 is a multilingual embedding model published by BAAI. It supports:
    - Dense vectors: semantic similarity retrieval
    - Sparse vectors: exact keyword matching (BM25 style)
    - ColBERT vectors: fine-grained interaction matching (optional)

    Highlights:
    - 100+ languages (optimised for Chinese and English)
    - Long context of up to 8192 tokens
    - Dense + sparse dual-path retrieval

    GitHub: https://github.com/FlagOpen/FlagEmbedding
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-m3",
        use_fp16: bool = True,
        device: Optional[str] = None,
        batch_size: int = 12,
        max_length: int = 8192,
        local_model_path: Optional[str] = None
    ):
        """
        Resolve the model location, pick a device and load the model.

        Args:
            model_name: Remote model name; only used when no local copy is found.
            use_fp16: Whether to run the model in FP16.
            device: Device string (cuda/cpu); auto-selected when None.
            batch_size: Batch size passed to the encoder.
            max_length: Maximum sequence length passed to the encoder.
            local_model_path: Optional local model directory. Used when it
                exists; otherwise the entries of LOCAL_MODEL_PATHS are probed.

        Raises:
            ImportError: If FlagEmbedding is not installed.
            Exception: Any error raised while loading the model is re-raised.
        """
        self.use_fp16 = use_fp16
        self.batch_size = batch_size
        self.max_length = max_length

        # An explicit, existing local path wins; otherwise fall back to the
        # well-known cache directories.
        if local_model_path and os.path.exists(local_model_path):
            resolved_local = local_model_path
        else:
            resolved_local = self._find_cached_model()

        if resolved_local is not None:
            self.model_path = resolved_local
            self.model_name = "local"
            logger.info(f"使用本地模型路径: {resolved_local}")
        else:
            self.model_path = model_name
            self.model_name = model_name
            logger.info(f"使用远程模型: {model_name}")

        # Auto-select the device when the caller did not specify one.
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        logger.info(f"初始化BGE-M3模型, 设备: {self.device}")

        self.model = None
        self._load_model()

    @staticmethod
    def _find_cached_model() -> Optional[str]:
        """Return the first LOCAL_MODEL_PATHS entry containing config.json, or None."""
        for candidate in LOCAL_MODEL_PATHS:
            # config.json existing implies the directory itself exists.
            if os.path.exists(os.path.join(candidate, "config.json")):
                return candidate
        return None

    def _load_model(self):
        """
        Load the embedding model into ``self.model``.

        FlagEmbedding is imported lazily so that a missing installation is
        reported with an install hint before the ImportError propagates.
        """
        try:
            from FlagEmbedding import BGEM3FlagModel

            self.model = BGEM3FlagModel(
                self.model_path,
                use_fp16=self.use_fp16,
                device=self.device
            )
            logger.success("BGE-M3模型加载成功")
        except ImportError:
            logger.warning("FlagEmbedding库未安装,请运行: pip install FlagEmbedding")
            raise
        except Exception as e:
            logger.error(f"模型加载失败: {e}")
            raise

    def embed(
        self,
        texts: List[str],
        return_dense: bool = True,
        return_sparse: bool = True,
        return_colbert_vecs: bool = False
    ) -> EmbeddingResult:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: Texts to embed.
            return_dense: Whether to request dense vectors.
            return_sparse: Whether to request sparse vectors.
            return_colbert_vecs: Whether to request ColBERT vectors (requested
                from the model but not included in the returned result).

        Returns:
            EmbeddingResult: For empty input, an empty result with ``dim=0``.
            Otherwise the requested vectors; an output that was not requested
            is returned as an empty array / empty list, and ``dim`` falls back
            to 1024 when no dense vectors are available.

        Raises:
            Exception: Any error raised by the encoder is logged and re-raised.
        """
        if not texts:
            logger.warning("输入文本列表为空")
            return EmbeddingResult(
                dense_embeddings=np.array([]),
                sparse_embeddings=[],
                texts=[],
                dim=0
            )

        logger.info(f"开始嵌入{len(texts)}个文本块")

        try:
            embeddings = self.model.encode(
                texts,
                batch_size=self.batch_size,
                max_length=self.max_length,
                return_dense=return_dense,
                return_sparse=return_sparse,
                return_colbert_vecs=return_colbert_vecs
            )

            # The encoder may report an output that was not requested as an
            # explicit None value rather than omitting the key, so a
            # dict.get() default alone is not enough: normalise None as well.
            dense_embeddings = embeddings.get('dense_vecs')
            if dense_embeddings is None:
                dense_embeddings = np.array([])

            sparse_embeddings = embeddings.get('lexical_weights')
            if sparse_embeddings is None:
                sparse_embeddings = []

            # Read the dimensionality from the data when dense vectors exist.
            dim = dense_embeddings.shape[1] if len(dense_embeddings) > 0 else 1024

            logger.success(f"嵌入完成,向量维度: {dim}")

            return EmbeddingResult(
                dense_embeddings=dense_embeddings,
                sparse_embeddings=sparse_embeddings,
                texts=texts,
                dim=dim
            )

        except Exception as e:
            logger.error(f"嵌入失败: {e}")
            raise

    def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
        """
        Generate embeddings for a single text.

        Args:
            text: Input text.

        Returns:
            Dict: ``'dense'`` (first dense vector), ``'sparse'`` (first sparse
            mapping, or ``{}`` when none was produced) and ``'dim'``.
        """
        result = self.embed([text])
        sparse = result.sparse_embeddings[0] if result.sparse_embeddings else {}
        return {
            'dense': result.dense_embeddings[0],
            'sparse': sparse,
            'dim': result.dim
        }

    def embed_dense(self, texts: List[str]) -> np.ndarray:
        """Generate dense vectors only."""
        result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
        return result.dense_embeddings

    def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
        """Generate sparse vectors only."""
        result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
        return result.sparse_embeddings

    def embed_query(self, query: str) -> Dict:
        """
        Generate embeddings for a query text (for retrieval).

        Args:
            query: Query text.

        Returns:
            Dict: Same structure as ``embed_single``.
        """
        return self.embed_single(query)

    def compute_similarity(
        self,
        query_embedding: np.ndarray,
        doc_embeddings: np.ndarray,
        metric: str = "cosine"
    ) -> np.ndarray:
        """
        Score a query vector against a matrix of document vectors.

        Args:
            query_embedding: Query vector.
            doc_embeddings: Document matrix, one row per document.
            metric: Similarity metric, ``"cosine"`` or ``"dot"``.

        Returns:
            np.ndarray: One score per document row. For ``"cosine"``, a pair
            in which either vector has zero norm scores 0.0 instead of NaN.

        Raises:
            ValueError: If ``metric`` is not supported.
        """
        if metric not in ("cosine", "dot"):
            raise ValueError(f"不支持的相似度度量: {metric}")

        dot_products = np.dot(doc_embeddings, query_embedding)
        if metric == "dot":
            return dot_products

        denominators = np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding)

        # Divide only where the denominator is non-zero; the remaining entries
        # keep their initial 0.0, which avoids NaN/inf for zero vectors.
        similarities = np.zeros(
            np.shape(dot_products),
            dtype=np.result_type(dot_products, denominators)
        )
        np.divide(dot_products, denominators, out=similarities, where=denominators > 0)
        return similarities

    def sparse_similarity(
        self,
        query_sparse: Dict[int, float],
        doc_sparse: Dict[int, float]
    ) -> float:
        """
        Score two sparse vectors (BM25 style).

        Args:
            query_sparse: Sparse query vector (token id -> weight).
            doc_sparse: Sparse document vector.

        Returns:
            float: Sum of weight products over the token ids present in both.
        """
        # Iterate the smaller mapping and probe the larger one.
        if len(query_sparse) <= len(doc_sparse):
            smaller, larger = query_sparse, doc_sparse
        else:
            smaller, larger = doc_sparse, query_sparse
        return sum(weight * larger[key] for key, weight in smaller.items() if key in larger)


def embed_texts(
    texts: List[str],
    model_name: str = "BAAI/bge-m3",
    **kwargs
) -> EmbeddingResult:
    """
    Convenience wrapper: embed a list of texts.

    A new BGEM3Embedder is constructed (and the model loaded) on every call;
    ``kwargs`` are forwarded to its constructor.
    """
    embedder = BGEM3Embedder(model_name=model_name, **kwargs)
    return embedder.embed(texts)


def embed_single_text(
    text: str,
    model_name: str = "BAAI/bge-m3",
    **kwargs
) -> Dict:
    """
    Convenience wrapper: embed a single text.

    A new BGEM3Embedder is constructed (and the model loaded) on every call;
    ``kwargs`` are forwarded to its constructor.
    """
    embedder = BGEM3Embedder(model_name=model_name, **kwargs)
    return embedder.embed_single(text)