"""文档处理主流程 - 解析→摘要→分块→嵌入→入库""" import os from typing import List, Dict, Optional from dataclasses import dataclass from loguru import logger import uuid from .parser.pdf_parser import PDFParser from .parser.docx_parser import DocxParser from .parser.mineru_parser import ParserOrchestrator from .embedding.text_chunker import RegulationChunker, TextChunk from .embedding.bge_m3_embedder import BGEM3Embedder, EmbeddingResult from .storage.milvus_client import MilvusClient from .llm.document_summarizer import DocumentSummarizer, DocumentSummary from app.config.settings import settings @dataclass class ProcessingResult: """文档处理结果""" doc_id: str doc_name: str success: bool num_chunks: int = 0 message: str = "" markdown_text: str = "" summary: str = "" summary_latency_ms: int = 0 class DocumentProcessor: """ 文档处理服务 - 完整处理流程 流程: 1. 文档解析(PDF/DOCX → Markdown) 2. 智能分块(章节级+条款级) 3. LLM摘要生成(可选) 4. 向量嵌入(BGE-M3 Dense+Sparse) 5. 存储入库(Milvus向量数据库) """ def __init__( self, chunk_size: int = None, embedding_model: str = None, use_mineru: bool = True, generate_summary: bool = False, # 默认不生成摘要,节省约60秒 llm_provider: str = None, llm_model: str = None ): """ 初始化文档处理器 Args: chunk_size: 分块大小 embedding_model: 嵌入模型名称 use_mineru: 是否优先使用MinerU解析 generate_summary: 是否生成文档摘要(默认False,可节省约60秒处理时间) llm_provider: LLM提供商 llm_model: LLM模型名称 """ self.chunk_size = chunk_size or settings.chunk_size self.embedding_model = embedding_model or settings.embedding_model self.use_mineru = use_mineru self.generate_summary = generate_summary self.llm_provider = llm_provider or settings.llm_provider self.llm_model = llm_model or settings.llm_model # 初始化各组件 logger.info("初始化文档处理组件...") # 解析器 self.parser = ParserOrchestrator() # 分块器 self.chunker = RegulationChunker(chunk_size=self.chunk_size) # 嵌入模型(延迟加载) self.embedder: Optional[BGEM3Embedder] = None # Milvus客户端(延迟连接) self.milvus: Optional[MilvusClient] = None # 摘要生成器(延迟加载) self.summarizer: Optional[DocumentSummarizer] = None logger.success("文档处理器初始化完成") def _init_embedder(self): """延迟初始化嵌入模型""" if self.embedder is None: logger.info("加载嵌入模型...") self.embedder = BGEM3Embedder(model_name=self.embedding_model) def _init_milvus(self): """延迟初始化Milvus连接""" if self.milvus is None: logger.info("连接Milvus...") self.milvus = MilvusClient() self.milvus.connect() self.milvus.create_collection(recreate=False) self.milvus.load_collection() def _init_summarizer(self): """延迟初始化摘要生成器""" if self.summarizer is None: logger.info("初始化摘要生成器...") self.summarizer = DocumentSummarizer( provider=self.llm_provider, model=self.llm_model ) def process( self, file_path: str, doc_id: Optional[str] = None, doc_name: Optional[str] = None, regulation_type: str = "", version: str = "" ) -> ProcessingResult: """ 处理单个文档 Args: file_path: 文档文件路径 doc_id: 文档ID(可选,默认自动生成) doc_name: 文档名称(可选,默认从文件名获取) regulation_type: 法规类型 version: 文档版本 Returns: ProcessingResult: 处理结果 """ # 生成或使用传入的文档ID if doc_id is None: doc_id = str(uuid.uuid4())[:8] # 获取文档名称 if doc_name is None: doc_name = os.path.basename(file_path) logger.info(f"开始处理文档: {doc_name} (ID: {doc_id})") # 初始化结果变量 summary = "" summary_latency_ms = 0 try: # 1. 文档解析 logger.info("Step 1: 文档解析") markdown_text = self._parse_document(file_path) if not markdown_text: return ProcessingResult( doc_id=doc_id, doc_name=doc_name, success=False, message="文档解析失败,内容为空" ) # 2. LLM摘要生成(可选) if self.generate_summary: logger.info("Step 2: LLM摘要生成") self._init_summarizer() summary_result = self.summarizer.summarize( doc_name, markdown_text, regulation_type ) if summary_result.is_success: summary = summary_result.summary summary_latency_ms = summary_result.latency_ms logger.success(f"摘要生成完成: {summary_latency_ms}ms") else: logger.warning(f"摘要生成失败: {summary_result.error}") else: logger.info("Step 2: 跳过摘要生成(未勾选,节省约60秒)") # 3. 智能分块 logger.info("Step 3: 智能分块") chunks = self._chunk_document( markdown_text, doc_id, doc_name, regulation_type, version ) if not chunks: return ProcessingResult( doc_id=doc_id, doc_name=doc_name, success=False, message="分块失败,无有效内容", markdown_text=markdown_text, summary=summary ) # 4. 向量嵌入 logger.info("Step 4: 向量嵌入") embeddings = self._embed_chunks(chunks) if embeddings is None: return ProcessingResult( doc_id=doc_id, doc_name=doc_name, success=False, message="向量嵌入失败", markdown_text=markdown_text, summary=summary ) # 5. 存储入库 logger.info("Step 5: 存储入库") inserted_ids = self._insert_to_milvus(chunks, embeddings) logger.success(f"文档处理完成: {doc_name}, 共{len(inserted_ids)}条记录") return ProcessingResult( doc_id=doc_id, doc_name=doc_name, success=True, num_chunks=len(inserted_ids), message="处理成功", markdown_text=markdown_text, summary=summary, summary_latency_ms=summary_latency_ms ) except Exception as e: logger.error(f"文档处理失败: {e}") return ProcessingResult( doc_id=doc_id, doc_name=doc_name, success=False, message=f"处理失败: {str(e)}" ) def _parse_document(self, file_path: str) -> str: """解析文档""" ext = os.path.splitext(file_path)[1].lower() try: if ext == ".pdf": # PDF文档解析(优先MinerU,回退PyMuPDF) markdown_text = self.parser.parse_pdf(file_path, prefer_mineru=self.use_mineru) elif ext in [".docx", ".doc"]: # Word文档解析 markdown_text = self.parser.parse_docx(file_path) else: logger.warning(f"不支持的文件类型: {ext}") return "" logger.success(f"文档解析完成,内容长度: {len(markdown_text)}字符") return markdown_text except Exception as e: logger.error(f"文档解析失败: {e}") return "" def _chunk_document( self, markdown_text: str, doc_id: str, doc_name: str, regulation_type: str, version: str ) -> List[TextChunk]: """分块文档""" try: chunks = self.chunker.chunk_document( markdown_text, doc_id=doc_id, doc_name=doc_name, regulation_type=regulation_type, version=version ) logger.success(f"分块完成,共{len(chunks)}个chunk") return chunks except Exception as e: logger.error(f"分块失败: {e}") return [] def _embed_chunks(self, chunks: List[TextChunk]) -> Optional[EmbeddingResult]: """嵌入分块""" try: # 延迟初始化嵌入模型 self._init_embedder() # 提取文本内容 texts = [chunk.content for chunk in chunks] # 执行嵌入 embeddings = self.embedder.embed(texts) logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}") return embeddings except Exception as e: logger.error(f"嵌入失败: {e}") return None def _insert_to_milvus( self, chunks: List[TextChunk], embeddings: EmbeddingResult ) -> List[int]: """插入Milvus""" try: # 延迟初始化Milvus self._init_milvus() # 执行插入 inserted_ids = self.milvus.insert_chunks(chunks, embeddings) logger.success(f"入库完成,共{len(inserted_ids)}条记录") return inserted_ids except Exception as e: logger.error(f"入库失败: {e}") return [] def search( self, query: str, top_k: int = 10, filters: Optional[str] = None ) -> List[Dict]: """ 检索法规内容 Args: query: 查询文本 top_k: 返回结果数量 filters: 过滤条件 Returns: List[Dict]: 检索结果 """ logger.info(f"执行检索: {query}") try: # 延迟初始化 self._init_embedder() self._init_milvus() # 生成查询向量 query_embedding = self.embedder.embed_single(query) # 执行混合检索 results = self.milvus.hybrid_search( query_dense=query_embedding['dense'].tolist(), query_sparse=query_embedding['sparse'], top_k=top_k, filters=filters ) # 转换为字典格式 result_dicts = [] for r in results: result_dicts.append({ "id": r.id, "content": r.content, "score": r.score, "metadata": r.metadata }) logger.success(f"检索完成,返回{len(result_dicts)}条结果") return result_dicts except Exception as e: logger.error(f"检索失败: {e}") return [] def close(self): """关闭连接""" if self.milvus: self.milvus.disconnect() logger.info("文档处理器已关闭") def process_document( file_path: str, doc_name: Optional[str] = None, regulation_type: str = "", version: str = "" ) -> ProcessingResult: """便捷函数:处理单个文档""" processor = DocumentProcessor() result = processor.process(file_path, doc_name, regulation_type, version) processor.close() return result def search_regulations(query: str, top_k: int = 10) -> List[Dict]: """便捷函数:检索法规""" processor = DocumentProcessor() results = processor.search(query, top_k) processor.close() return results