Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -1,4 +1,4 @@
"""BGE-M3嵌入服务 - Dense+Sparse双路向量生成"""
"""Provide service-layer logic for bge m3 embedder."""
import numpy as np
from typing import List, Dict, Optional, Union
@@ -6,43 +6,31 @@ from dataclasses import dataclass, field
from loguru import logger
import torch
import os
# Point HuggingFace downloads at the hf-mirror.com mirror (intended for
# networks inside mainland China). An HF_ENDPOINT value that is already
# present in the environment is left untouched.
os.environ.setdefault('HF_ENDPOINT', 'https://hf-mirror.com')
# Local model directories, checked in priority order.
LOCAL_MODEL_PATHS = [
    os.path.expanduser("~/.cache/modelscope/Xorbits/bge-m3"),  # ModelScope download path
    os.path.expanduser("~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main"),  # HuggingFace local path
]
@dataclass
class EmbeddingResult:
    """Embedding result: dense and sparse vectors plus the source texts."""
    dense_embeddings: np.ndarray  # dense vectors, used for semantic retrieval
    sparse_embeddings: List[Dict[int, float]]  # sparse vectors, used for keyword matching
    texts: List[str]  # the texts that were embedded
    dim: int = 1024  # vector dimension; defaults to 1024
class BGEM3Embedder:
"""
BGE-M3多语言嵌入模型服务
BGE-M3是BAAI发布的多语言嵌入模型支持
- Dense向量用于语义相似度检索
- Sparse向量用于关键词精确匹配BM25风格
- ColBERT向量用于细粒度交互匹配可选
特点:
- 支持100+语言(中英双语优化)
- 8192 tokens超长上下文
- Dense+Sparse双路检索能力
GitHub: https://github.com/FlagOpen/FlagEmbedding
"""
"""Represent the B G E M3 Embedder type."""
def __init__(
self,
@@ -53,28 +41,18 @@ class BGEM3Embedder:
max_length: int = 8192,
local_model_path: Optional[str] = None
):
"""
初始化BGE-M3嵌入模型
Args:
model_name: 模型名称(如果使用本地路径,此参数会被忽略)
use_fp16: 是否使用FP16加速
device: 设备类型cuda/cpu默认自动选择
batch_size: 批处理大小
max_length: 最大序列长度
local_model_path: 本地模型路径(可选,优先使用)
"""
"""Initialize the B G E M3 Embedder instance."""
self.use_fp16 = use_fp16
self.batch_size = batch_size
self.max_length = max_length
# Determine the model path (a local path takes precedence).
if local_model_path and os.path.exists(local_model_path):
self.model_path = local_model_path
self.model_name = "local"
logger.info(f"使用本地模型路径: {local_model_path}")
else:
# Check each of the known candidate local paths.
found_local = False
for path in LOCAL_MODEL_PATHS:
if os.path.exists(path) and os.path.exists(os.path.join(path, "config.json")):
@@ -89,7 +67,7 @@ class BGEM3Embedder:
self.model_name = model_name
logger.info(f"使用远程模型: {model_name}")
# Select the device automatically.
if device is None:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
else:
@@ -101,7 +79,7 @@ class BGEM3Embedder:
self._load_model()
def _load_model(self):
"""加载嵌入模型"""
"""Handle load model for this module for the B G E M3 Embedder instance."""
try:
from FlagEmbedding import BGEM3FlagModel
@@ -127,18 +105,7 @@ class BGEM3Embedder:
return_sparse: bool = True,
return_colbert_vecs: bool = False
) -> EmbeddingResult:
"""
对文本列表生成嵌入向量
Args:
texts: 文本列表
return_dense: 是否返回Dense向量
return_sparse: 是否返回Sparse向量
return_colbert_vecs: 是否返回ColBERT向量
Returns:
EmbeddingResult: 嵌入结果
"""
"""Handle embed for the B G E M3 Embedder instance."""
if not texts:
logger.warning("输入文本列表为空")
return EmbeddingResult(
@@ -151,7 +118,7 @@ class BGEM3Embedder:
logger.info(f"开始嵌入{len(texts)}个文本块")
try:
# Run the embedding.
embeddings = self.model.encode(
texts,
batch_size=self.batch_size,
@@ -161,11 +128,11 @@ class BGEM3Embedder:
return_colbert_vecs=return_colbert_vecs
)
# Extract the results.
dense_embeddings = embeddings.get('dense_vecs', np.array([]))
sparse_embeddings = embeddings.get('lexical_weights', [])
# Get the vector dimension (falls back to 1024 when no dense vectors came back).
dim = dense_embeddings.shape[1] if len(dense_embeddings) > 0 else 1024
logger.success(f"嵌入完成,向量维度: {dim}")
@@ -182,15 +149,7 @@ class BGEM3Embedder:
raise
def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
"""
对单个文本生成嵌入向量
Args:
text: 输入文本
Returns:
Dict: 包含dense和sparse向量
"""
"""Embed single for the B G E M3 Embedder instance."""
result = self.embed([text])
return {
'dense': result.dense_embeddings[0],
@@ -199,25 +158,17 @@ class BGEM3Embedder:
}
def embed_dense(self, texts: List[str]) -> np.ndarray:
    """Generate only the dense vectors for ``texts``.

    Args:
        texts: Texts to embed.

    Returns:
        The ``dense_embeddings`` of the result returned by ``self.embed``
        when called with sparse and ColBERT outputs disabled.
    """
    result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
    return result.dense_embeddings
def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
    """Generate only the sparse vectors for ``texts``.

    Args:
        texts: Texts to embed.

    Returns:
        The ``sparse_embeddings`` of the result returned by ``self.embed``
        when called with dense and ColBERT outputs disabled.
    """
    result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
    return result.sparse_embeddings
def embed_query(self, query: str) -> Dict:
    """Generate embeddings for a query text (used for retrieval).

    Args:
        query: Query text.

    Returns:
        The dict returned by ``self.embed_single(query)``, which holds the
        dense and sparse vectors.
    """
    return self.embed_single(query)
def compute_similarity(
@@ -226,26 +177,16 @@ class BGEM3Embedder:
doc_embeddings: np.ndarray,
metric: str = "cosine"
) -> np.ndarray:
"""
计算查询与文档的相似度
Args:
query_embedding: 查询向量
doc_embeddings: 文档向量矩阵
metric: 相似度度量cosine/dot
Returns:
np.ndarray: 相似度分数数组
"""
"""Handle compute similarity for the B G E M3 Embedder instance."""
if metric == "cosine":
# Cosine similarity. NOTE(review): a zero-norm query or document vector makes
# the division below divide by zero; confirm callers never pass one.
query_norm = np.linalg.norm(query_embedding)
doc_norms = np.linalg.norm(doc_embeddings, axis=1)
similarities = np.dot(doc_embeddings, query_embedding) / (doc_norms * query_norm)
elif metric == "dot":
# Dot-product similarity.
similarities = np.dot(doc_embeddings, query_embedding)
else:
@@ -258,17 +199,8 @@ class BGEM3Embedder:
query_sparse: Dict[int, float],
doc_sparse: Dict[int, float]
) -> float:
"""
计算Sparse向量的相似度BM25风格
Args:
query_sparse: 查询的Sparse向量词ID -> 权重)
doc_sparse: 文档的Sparse向量
Returns:
float: 相似度分数
"""
# 计算交集词的点积
"""Handle sparse similarity for the B G E M3 Embedder instance."""
# Keep service responsibilities explicit so downstream behavior stays predictable.
common_keys = set(query_sparse.keys()) & set(doc_sparse.keys())
score = sum(query_sparse[k] * doc_sparse[k] for k in common_keys)
@@ -280,7 +212,7 @@ def embed_texts(
model_name: str = "BAAI/bge-m3",
**kwargs
) -> EmbeddingResult:
"""便捷函数:对文本列表生成嵌入"""
"""Embed texts."""
embedder = BGEM3Embedder(model_name=model_name, **kwargs)
return embedder.embed(texts)
@@ -290,6 +222,6 @@ def embed_single_text(
model_name: str = "BAAI/bge-m3",
**kwargs
) -> Dict:
"""便捷函数:对单个文本生成嵌入"""
"""Embed single text."""
embedder = BGEM3Embedder(model_name=model_name, **kwargs)
return embedder.embed_single(text)