update

2026-05-14 15:07:34 +08:00
parent c2a398930d
commit 10d04c4083
179 changed files with 24073 additions and 1243 deletions
--- a/backend/app/services/embedding/__init__.py
+++ b/backend/app/services/embedding/__init__.py
@@ -0,0 +1,7 @@
+# src/services/embedding/__init__.py
+"""嵌入和分块服务"""
+
+from .text_chunker import RegulationChunker
+from .bge_m3_embedder import BGEM3Embedder
+
+__all__ = ["RegulationChunker", "BGEM3Embedder"]
--- a/backend/app/services/embedding/bge_m3_embedder.py
+++ b/backend/app/services/embedding/bge_m3_embedder.py
@@ -0,0 +1,296 @@
+# src/services/embedding/bge_m3_embedder.py
+"""BGE-M3嵌入服务 - Dense+Sparse双路向量生成"""
+
+import numpy as np
+from typing import List, Dict, Optional, Union
+from dataclasses import dataclass, field
+from loguru import logger
+import torch
+import os
+
+# 设置HuggingFace镜像（国内网络）
+if 'HF_ENDPOINT' not in os.environ:
+    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+
+# 本地模型路径（按优先级检查）
+LOCAL_MODEL_PATHS = [
+    os.path.expanduser("~/.cache/modelscope/Xorbits/bge-m3"),  # ModelScope下载路径
+    os.path.expanduser("~/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/main"),  # HuggingFace本地路径
+]
+
+
+@dataclass
+class EmbeddingResult:
+    """Container for the vectors produced by one embedding call."""
+    dense_embeddings: np.ndarray  # Dense vectors (semantic retrieval)
+    # Sparse vectors (keyword matching), one weight mapping per input text.
+    # NOTE(review): the model library may key these by string token id rather
+    # than int — confirm against the FlagEmbedding version in use.
+    sparse_embeddings: List[Dict[int, float]]
+    texts: List[str]  # The input texts the vectors were computed for
+    dim: int = 1024  # Dense vector dimension
+
+
+class BGEM3Embedder:
+    """
+    BGE-M3多语言嵌入模型服务
+
+    BGE-M3是BAAI发布的多语言嵌入模型，支持：
+    - Dense向量：用于语义相似度检索
+    - Sparse向量：用于关键词精确匹配（BM25风格）
+    - ColBERT向量：用于细粒度交互匹配（可选）
+
+    特点：
+    - 支持100+语言（中英双语优化）
+    - 8192 tokens超长上下文
+    - Dense+Sparse双路检索能力
+
+    GitHub: https://github.com/FlagOpen/FlagEmbedding
+    """
+
+    def __init__(
+        self,
+        model_name: str = "BAAI/bge-m3",
+        use_fp16: bool = True,
+        device: Optional[str] = None,
+        batch_size: int = 12,
+        max_length: int = 8192,
+        local_model_path: Optional[str] = None
+    ):
+        """
+        初始化BGE-M3嵌入模型
+
+        Args:
+            model_name: 模型名称（如果使用本地路径，此参数会被忽略）
+            use_fp16: 是否使用FP16加速
+            device: 设备类型（cuda/cpu），默认自动选择
+            batch_size: 批处理大小
+            max_length: 最大序列长度
+            local_model_path: 本地模型路径（可选，优先使用）
+        """
+        self.use_fp16 = use_fp16
+        self.batch_size = batch_size
+        self.max_length = max_length
+
+        # 确定模型路径（优先使用本地路径）
+        if local_model_path and os.path.exists(local_model_path):
+            self.model_path = local_model_path
+            self.model_name = "local"
+            logger.info(f"使用本地模型路径: {local_model_path}")
+        else:
+            # 检查多个可能的本地路径
+            found_local = False
+            for path in LOCAL_MODEL_PATHS:
+                if os.path.exists(path) and os.path.exists(os.path.join(path, "config.json")):
+                    self.model_path = path
+                    self.model_name = "local"
+                    logger.info(f"使用本地模型路径: {path}")
+                    found_local = True
+                    break
+
+            if not found_local:
+                self.model_path = model_name
+                self.model_name = model_name
+                logger.info(f"使用远程模型: {model_name}")
+
+        # 自动选择设备
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+
+        logger.info(f"初始化BGE-M3模型, 设备: {self.device}")
+
+        self.model = None
+        self._load_model()
+
+    def _load_model(self):
+        """加载嵌入模型"""
+        try:
+            from FlagEmbedding import BGEM3FlagModel
+
+            self.model = BGEM3FlagModel(
+                self.model_path,
+                use_fp16=self.use_fp16,
+                device=self.device
+            )
+
+            logger.success(f"BGE-M3模型加载成功")
+
+        except ImportError:
+            logger.warning("FlagEmbedding库未安装，请运行: pip install FlagEmbedding")
+            raise
+        except Exception as e:
+            logger.error(f"模型加载失败: {e}")
+            raise
+
+    def embed(
+        self,
+        texts: List[str],
+        return_dense: bool = True,
+        return_sparse: bool = True,
+        return_colbert_vecs: bool = False
+    ) -> EmbeddingResult:
+        """
+        对文本列表生成嵌入向量
+
+        Args:
+            texts: 文本列表
+            return_dense: 是否返回Dense向量
+            return_sparse: 是否返回Sparse向量
+            return_colbert_vecs: 是否返回ColBERT向量
+
+        Returns:
+            EmbeddingResult: 嵌入结果
+        """
+        if not texts:
+            logger.warning("输入文本列表为空")
+            return EmbeddingResult(
+                dense_embeddings=np.array([]),
+                sparse_embeddings=[],
+                texts=[],
+                dim=0
+            )
+
+        logger.info(f"开始嵌入{len(texts)}个文本块")
+
+        try:
+            # 执行嵌入
+            embeddings = self.model.encode(
+                texts,
+                batch_size=self.batch_size,
+                max_length=self.max_length,
+                return_dense=return_dense,
+                return_sparse=return_sparse,
+                return_colbert_vecs=return_colbert_vecs
+            )
+
+            # 提取结果
+            dense_embeddings = embeddings.get('dense_vecs', np.array([]))
+            sparse_embeddings = embeddings.get('lexical_weights', [])
+
+            # 获取维度
+            dim = dense_embeddings.shape[1] if len(dense_embeddings) > 0 else 1024
+
+            logger.success(f"嵌入完成，向量维度: {dim}")
+
+            return EmbeddingResult(
+                dense_embeddings=dense_embeddings,
+                sparse_embeddings=sparse_embeddings,
+                texts=texts,
+                dim=dim
+            )
+
+        except Exception as e:
+            logger.error(f"嵌入失败: {e}")
+            raise
+
+    def embed_single(self, text: str) -> Dict[str, Union[np.ndarray, Dict]]:
+        """
+        对单个文本生成嵌入向量
+
+        Args:
+            text: 输入文本
+
+        Returns:
+            Dict: 包含dense和sparse向量
+        """
+        result = self.embed([text])
+        return {
+            'dense': result.dense_embeddings[0],
+            'sparse': result.sparse_embeddings[0] if result.sparse_embeddings else {},
+            'dim': result.dim
+        }
+
+    def embed_dense(self, texts: List[str]) -> np.ndarray:
+        """只生成Dense向量"""
+        result = self.embed(texts, return_sparse=False, return_colbert_vecs=False)
+        return result.dense_embeddings
+
+    def embed_sparse(self, texts: List[str]) -> List[Dict[int, float]]:
+        """只生成Sparse向量"""
+        result = self.embed(texts, return_dense=False, return_colbert_vecs=False)
+        return result.sparse_embeddings
+
+    def embed_query(self, query: str) -> Dict:
+        """
+        对查询文本生成嵌入（用于检索）
+
+        Args:
+            query: 查询文本
+
+        Returns:
+            Dict: 包含dense和sparse向量
+        """
+        return self.embed_single(query)
+
+    def compute_similarity(
+        self,
+        query_embedding: np.ndarray,
+        doc_embeddings: np.ndarray,
+        metric: str = "cosine"
+    ) -> np.ndarray:
+        """
+        计算查询与文档的相似度
+
+        Args:
+            query_embedding: 查询向量
+            doc_embeddings: 文档向量矩阵
+            metric: 相似度度量（cosine/dot）
+
+        Returns:
+            np.ndarray: 相似度分数数组
+        """
+        if metric == "cosine":
+            # 余弦相似度
+            query_norm = np.linalg.norm(query_embedding)
+            doc_norms = np.linalg.norm(doc_embeddings, axis=1)
+
+            similarities = np.dot(doc_embeddings, query_embedding) / (doc_norms * query_norm)
+
+        elif metric == "dot":
+            # 点积相似度
+            similarities = np.dot(doc_embeddings, query_embedding)
+
+        else:
+            raise ValueError(f"不支持的相似度度量: {metric}")
+
+        return similarities
+
+    def sparse_similarity(
+        self,
+        query_sparse: Dict[int, float],
+        doc_sparse: Dict[int, float]
+    ) -> float:
+        """
+        计算Sparse向量的相似度（BM25风格）
+
+        Args:
+            query_sparse: 查询的Sparse向量（词ID -> 权重）
+            doc_sparse: 文档的Sparse向量
+
+        Returns:
+            float: 相似度分数
+        """
+        # 计算交集词的点积
+        common_keys = set(query_sparse.keys()) & set(doc_sparse.keys())
+
+        score = sum(query_sparse[k] * doc_sparse[k] for k in common_keys)
+        return score
+
+
+def embed_texts(
+    texts: List[str],
+    model_name: str = "BAAI/bge-m3",
+    **kwargs
+) -> EmbeddingResult:
+    """便捷函数：对文本列表生成嵌入"""
+    embedder = BGEM3Embedder(model_name=model_name, **kwargs)
+    return embedder.embed(texts)
+
+
+def embed_single_text(
+    text: str,
+    model_name: str = "BAAI/bge-m3",
+    **kwargs
+) -> Dict:
+    """便捷函数：对单个文本生成嵌入"""
+    embedder = BGEM3Embedder(model_name=model_name, **kwargs)
+    return embedder.embed_single(text)
--- a/backend/app/services/embedding/text_chunker.py
+++ b/backend/app/services/embedding/text_chunker.py
@@ -0,0 +1,449 @@
+# src/services/embedding/text_chunker.py
+"""智能分块器 - 章节级+条款级双粒度切割"""
+
+import re
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass, field
+from loguru import logger
+
+
+@dataclass
+class ChunkMetadata:
+    """Metadata describing where a chunk comes from within its document."""
+    doc_id: str = ""
+    doc_name: str = ""
+    chunk_id: str = ""
+    section_number: str = ""  # Chapter/section number (e.g. "第一章")
+    section_title: str = ""   # Chapter/section title
+    clause_number: str = ""   # Clause number (e.g. "第一条")
+    page_number: int = 0
+    start_position: int = 0   # Start offset in the original text
+    end_position: int = 0     # End offset in the original text
+    regulation_type: str = ""  # Regulation type
+    version: str = ""
+
+
+@dataclass
+class TextChunk:
+    """One chunk of text together with its metadata."""
+    content: str
+    metadata: ChunkMetadata
+    token_count: int = 0  # Estimated number of tokens
+
+
+class RegulationChunker:
+    """
+    法规文档智能分块器
+
+    实现章节级/条款级双粒度切割，适配国标GB文档结构：
+    - 国标文档通常有明确的层级结构：章 > 节 > 条
+    - 每个条款应作为一个独立的语义单元
+    - 保留条款完整性，避免跨条款截断
+    """
+
+    # 法规标题模式
+    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s+[^\n]+')
+    SECTION_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+节\s+[^\n]+')
+    CLAUSE_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+条\s')
+
+    # 条款子项模式
+    SUB_ITEM_PATTERN = re.compile(r'^[\(（][一二三四五六七八九十]+[\)）]\s')
+    NUMBER_ITEM_PATTERN = re.compile(r'^[\d]+[\.、]\s')
+
+    def __init__(
+        self,
+        chunk_size: int = 512,
+        chunk_overlap: int = 50,
+        max_chunk_size: int = 2048,
+        min_chunk_size: int = 100
+    ):
+        """
+        初始化分块器
+
+        Args:
+            chunk_size: 默认分块大小（字符数）
+            chunk_overlap: 分块重叠大小
+            max_chunk_size: 最大分块大小（防止单个条款过长）
+            min_chunk_size: 最小分块大小（防止碎片化）
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.max_chunk_size = max_chunk_size
+        self.min_chunk_size = min_chunk_size
+
+    def chunk_document(
+        self,
+        markdown_text: str,
+        doc_id: str = "",
+        doc_name: str = "",
+        regulation_type: str = "",
+        version: str = ""
+    ) -> List[TextChunk]:
+        """
+        对法规文档进行智能分块
+
+        Args:
+            markdown_text: Markdown格式的文档内容
+            doc_id: 文档ID
+            doc_name: 文档名称
+            regulation_type: 法规类型
+            version: 文档版本
+
+        Returns:
+            List[TextChunk]: 分块列表
+        """
+        logger.info(f"开始分块文档: {doc_name}")
+
+        # 1. 按章节分割（一级分块）
+        sections = self._split_by_sections(markdown_text)
+
+        # 2. 在每个章节内按条款分割（二级分块）
+        chunks = []
+        global_position = 0
+
+        for section_num, section_title, section_content, section_start in sections:
+            # 在章节内按条款分割
+            clause_chunks = self._split_by_clauses(
+                section_content,
+                section_num,
+                section_title,
+                section_start + global_position
+            )
+
+            for chunk_content, clause_num, clause_title, start_pos, end_pos in clause_chunks:
+                # 处理过长的条款（进一步细分）
+                if len(chunk_content) > self.max_chunk_size:
+                    sub_chunks = self._split_long_clause(
+                        chunk_content,
+                        clause_num,
+                        clause_title
+                    )
+                    for sub_content, sub_start, sub_end in sub_chunks:
+                        chunk = self._create_chunk(
+                            sub_content,
+                            doc_id,
+                            doc_name,
+                            section_num,
+                            section_title,
+                            clause_num,
+                            sub_start + start_pos,
+                            sub_end + start_pos,
+                            regulation_type,
+                            version
+                        )
+                        chunks.append(chunk)
+                else:
+                    chunk = self._create_chunk(
+                        chunk_content,
+                        doc_id,
+                        doc_name,
+                        section_num,
+                        section_title,
+                        clause_num,
+                        start_pos,
+                        end_pos,
+                        regulation_type,
+                        version
+                    )
+                    chunks.append(chunk)
+
+        logger.success(f"分块完成，共{len(chunks)}个chunk")
+        return chunks
+
+    def _split_by_sections(self, markdown_text: str) -> List[Tuple[str, str, str, int]]:
+        """
+        按章节分割文档
+
+        Returns:
+            List of (section_number, section_title, section_content, start_position)
+        """
+        sections = []
+        lines = markdown_text.split('\n')
+
+        current_section_num = ""
+        current_section_title = ""
+        current_section_content = []
+        current_section_start = 0
+
+        for i, line in enumerate(lines):
+            # 检测章节标题
+            chapter_match = self.CHAPTER_PATTERN.match(line.strip())
+            section_match = self.SECTION_PATTERN.match(line.strip())
+
+            if chapter_match or section_match:
+                # 保存上一个章节
+                if current_section_content:
+                    content = '\n'.join(current_section_content)
+                    sections.append((
+                        current_section_num,
+                        current_section_title,
+                        content,
+                        current_section_start
+                    ))
+
+                # 开始新章节
+                current_section_start = sum(len(l) + 1 for l in lines[:i])
+                current_section_content = []
+
+                if chapter_match:
+                    current_section_num = line.strip()
+                    current_section_title = self._extract_title(line.strip())
+                else:
+                    current_section_num = line.strip()
+                    current_section_title = self._extract_title(line.strip())
+
+            current_section_content.append(line)
+
+        # 保存最后一个章节
+        if current_section_content:
+            content = '\n'.join(current_section_content)
+            sections.append((
+                current_section_num,
+                current_section_title,
+                content,
+                current_section_start
+            ))
+
+        # 如果没有检测到章节，将整个文档作为一个大章节
+        if not sections:
+            sections.append((
+                "",
+                "全文",
+                markdown_text,
+                0
+            ))
+
+        return sections
+
+    def _split_by_clauses(
+        self,
+        section_content: str,
+        section_num: str,
+        section_title: str,
+        section_start: int
+    ) -> List[Tuple[str, str, str, int, int]]:
+        """
+        在章节内按条款分割
+
+        Returns:
+            List of (content, clause_number, clause_title, start_position, end_position)
+        """
+        clauses = []
+        lines = section_content.split('\n')
+
+        current_clause_num = ""
+        current_clause_title = ""
+        current_clause_content = []
+        current_clause_start = section_start
+
+        for i, line in enumerate(lines):
+            # 检测条款标题
+            clause_match = self.CLAUSE_PATTERN.match(line.strip())
+
+            if clause_match:
+                # 保存上一个条款
+                if current_clause_content:
+                    content = '\n'.join(current_clause_content)
+                    end_pos = current_clause_start + len(content)
+                    clauses.append((
+                        content,
+                        current_clause_num,
+                        current_clause_title,
+                        current_clause_start,
+                        end_pos
+                    ))
+
+                # 开始新条款
+                current_clause_start = section_start + sum(len(l) + 1 for l in lines[:i])
+                current_clause_content = []
+                current_clause_num = self._extract_clause_number(line.strip())
+                current_clause_title = line.strip()
+
+            current_clause_content.append(line)
+
+        # 保存最后一个条款
+        if current_clause_content:
+            content = '\n'.join(current_clause_content)
+            end_pos = current_clause_start + len(content)
+            clauses.append((
+                content,
+                current_clause_num,
+                current_clause_title,
+                current_clause_start,
+                end_pos
+            ))
+
+        # 如果没有检测到条款，将整个章节作为一个条款
+        if not clauses:
+            clauses.append((
+                section_content,
+                "",
+                section_title,
+                section_start,
+                section_start + len(section_content)
+            ))
+
+        return clauses
+
+    def _split_long_clause(
+        self,
+        content: str,
+        clause_num: str,
+        clause_title: str
+    ) -> List[Tuple[str, int, int]]:
+        """
+        分割过长的条款内容
+
+        按条款子项或段落分割，保持语义完整性
+        """
+        sub_chunks = []
+        lines = content.split('\n')
+
+        # 检测是否有子项结构
+        has_sub_items = any(
+            self.SUB_ITEM_PATTERN.match(line.strip()) or
+            self.NUMBER_ITEM_PATTERN.match(line.strip())
+            for line in lines
+        )
+
+        if has_sub_items:
+            # 按子项分割
+            current_sub_content = []
+            current_sub_start = 0
+
+            for i, line in enumerate(lines):
+                is_sub_item = (
+                    self.SUB_ITEM_PATTERN.match(line.strip()) or
+                    self.NUMBER_ITEM_PATTERN.match(line.strip())
+                )
+
+                if is_sub_item and current_sub_content:
+                    sub_content = '\n'.join(current_sub_content)
+                    sub_end = current_sub_start + len(sub_content)
+                    if len(sub_content) >= self.min_chunk_size:
+                        sub_chunks.append((sub_content, current_sub_start, sub_end))
+                    current_sub_content = []
+                    current_sub_start = sum(len(l) + 1 for l in lines[:i])
+
+                current_sub_content.append(line)
+
+            # 保存最后一个子项
+            if current_sub_content:
+                sub_content = '\n'.join(current_sub_content)
+                sub_end = current_sub_start + len(sub_content)
+                sub_chunks.append((sub_content, current_sub_start, sub_end))
+
+        else:
+            # 按段落分割（滑动窗口）
+            paragraphs = []
+            current_para = []
+
+            for line in lines:
+                if line.strip():
+                    current_para.append(line)
+                else:
+                    if current_para:
+                        paragraphs.append('\n'.join(current_para))
+                        current_para = []
+
+            if current_para:
+                paragraphs.append('\n'.join(current_para))
+
+            # 合并段落直到达到chunk_size
+            current_chunk = []
+            current_length = 0
+            chunk_start = 0
+
+            for para in paragraphs:
+                if current_length + len(para) > self.chunk_size and current_chunk:
+                    chunk_content = '\n'.join(current_chunk)
+                    chunk_end = chunk_start + len(chunk_content)
+                    sub_chunks.append((chunk_content, chunk_start, chunk_end))
+                    current_chunk = []
+                    current_length = 0
+                    chunk_start = chunk_end
+
+                current_chunk.append(para)
+                current_length += len(para)
+
+            # 保存最后一个chunk
+            if current_chunk:
+                chunk_content = '\n'.join(current_chunk)
+                chunk_end = chunk_start + len(chunk_content)
+                sub_chunks.append((chunk_content, chunk_start, chunk_end))
+
+        return sub_chunks
+
+    def _extract_title(self, header_line: str) -> str:
+        """从标题行提取标题内容"""
+        # 移除"第X章"、"第X节"前缀
+        title = re.sub(r'^第[一二三四五六七八九十百]+[章节]\s+', '', header_line)
+        return title.strip()
+
+    def _extract_clause_number(self, clause_line: str) -> str:
+        """从条款行提取条款编号"""
+        match = self.CLAUSE_PATTERN.match(clause_line)
+        if match:
+            return match.group(0).strip()
+        return ""
+
+    def _create_chunk(
+        self,
+        content: str,
+        doc_id: str,
+        doc_name: str,
+        section_num: str,
+        section_title: str,
+        clause_num: str,
+        start_pos: int,
+        end_pos: int,
+        regulation_type: str,
+        version: str
+    ) -> TextChunk:
+        """创建文本分块"""
+        # 清理内容
+        content = content.strip()
+
+        # 计算估算token数（中文约1.5字符/token）
+        token_count = int(len(content) * 0.7)  # 简化估算
+
+        # 生成chunk_id
+        chunk_id = f"{doc_id}_{section_num}_{clause_num}_{start_pos}"
+
+        metadata = ChunkMetadata(
+            doc_id=doc_id,
+            doc_name=doc_name,
+            chunk_id=chunk_id,
+            section_number=section_num,
+            section_title=section_title,
+            clause_number=clause_num,
+            start_position=start_pos,
+            end_position=end_pos,
+            regulation_type=regulation_type,
+            version=version
+        )
+
+        return TextChunk(
+            content=content,
+            metadata=metadata,
+            token_count=token_count
+        )
+
+
+def chunk_regulation_document(
+    markdown_text: str,
+    doc_id: str = "",
+    doc_name: str = "",
+    regulation_type: str = "",
+    version: str = "",
+    chunk_size: int = 512
+) -> List[TextChunk]:
+    """便捷函数：对法规文档进行分块"""
+    chunker = RegulationChunker(chunk_size=chunk_size)
+    return chunker.chunk_document(
+        markdown_text,
+        doc_id,
+        doc_name,
+        regulation_type,
+        version
+    )