405 lines
12 KiB
Python
405 lines
12 KiB
Python
# src/services/document_processor.py
|
||
"""文档处理主流程 - 解析→摘要→分块→嵌入→入库"""
|
||
|
||
import os
|
||
from typing import List, Dict, Optional
|
||
from dataclasses import dataclass
|
||
from loguru import logger
|
||
import uuid
|
||
|
||
from .parser.pdf_parser import PDFParser
|
||
from .parser.docx_parser import DocxParser
|
||
from .parser.mineru_parser import ParserOrchestrator
|
||
from .embedding.text_chunker import RegulationChunker, TextChunk
|
||
from .embedding.bge_m3_embedder import BGEM3Embedder, EmbeddingResult
|
||
from .storage.milvus_client import MilvusClient
|
||
from .llm.document_summarizer import DocumentSummarizer, DocumentSummary
|
||
from app.config.settings import settings
|
||
|
||
|
||
@dataclass
|
||
class ProcessingResult:
|
||
"""文档处理结果"""
|
||
doc_id: str
|
||
doc_name: str
|
||
success: bool
|
||
num_chunks: int = 0
|
||
message: str = ""
|
||
markdown_text: str = ""
|
||
summary: str = ""
|
||
summary_latency_ms: int = 0
|
||
|
||
|
||
class DocumentProcessor:
|
||
"""
|
||
文档处理服务 - 完整处理流程
|
||
|
||
流程:
|
||
1. 文档解析(PDF/DOCX → Markdown)
|
||
2. 智能分块(章节级+条款级)
|
||
3. LLM摘要生成(可选)
|
||
4. 向量嵌入(BGE-M3 Dense+Sparse)
|
||
5. 存储入库(Milvus向量数据库)
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
chunk_size: int = None,
|
||
embedding_model: str = None,
|
||
use_mineru: bool = True,
|
||
generate_summary: bool = False, # 默认不生成摘要,节省约60秒
|
||
llm_provider: str = None,
|
||
llm_model: str = None
|
||
):
|
||
"""
|
||
初始化文档处理器
|
||
|
||
Args:
|
||
chunk_size: 分块大小
|
||
embedding_model: 嵌入模型名称
|
||
use_mineru: 是否优先使用MinerU解析
|
||
generate_summary: 是否生成文档摘要(默认False,可节省约60秒处理时间)
|
||
llm_provider: LLM提供商
|
||
llm_model: LLM模型名称
|
||
"""
|
||
self.chunk_size = chunk_size or settings.chunk_size
|
||
self.embedding_model = embedding_model or settings.embedding_model
|
||
self.use_mineru = use_mineru
|
||
self.generate_summary = generate_summary
|
||
self.llm_provider = llm_provider or settings.llm_provider
|
||
self.llm_model = llm_model or settings.llm_model
|
||
|
||
# 初始化各组件
|
||
logger.info("初始化文档处理组件...")
|
||
|
||
# 解析器
|
||
self.parser = ParserOrchestrator()
|
||
|
||
# 分块器
|
||
self.chunker = RegulationChunker(chunk_size=self.chunk_size)
|
||
|
||
# 嵌入模型(延迟加载)
|
||
self.embedder: Optional[BGEM3Embedder] = None
|
||
|
||
# Milvus客户端(延迟连接)
|
||
self.milvus: Optional[MilvusClient] = None
|
||
|
||
# 摘要生成器(延迟加载)
|
||
self.summarizer: Optional[DocumentSummarizer] = None
|
||
|
||
logger.success("文档处理器初始化完成")
|
||
|
||
def _init_embedder(self):
|
||
"""延迟初始化嵌入模型"""
|
||
if self.embedder is None:
|
||
logger.info("加载嵌入模型...")
|
||
self.embedder = BGEM3Embedder(model_name=self.embedding_model)
|
||
|
||
def _init_milvus(self):
|
||
"""延迟初始化Milvus连接"""
|
||
if self.milvus is None:
|
||
logger.info("连接Milvus...")
|
||
self.milvus = MilvusClient()
|
||
self.milvus.connect()
|
||
self.milvus.create_collection(recreate=False)
|
||
self.milvus.load_collection()
|
||
|
||
def _init_summarizer(self):
|
||
"""延迟初始化摘要生成器"""
|
||
if self.summarizer is None:
|
||
logger.info("初始化摘要生成器...")
|
||
self.summarizer = DocumentSummarizer(
|
||
provider=self.llm_provider,
|
||
model=self.llm_model
|
||
)
|
||
|
||
def process(
|
||
self,
|
||
file_path: str,
|
||
doc_id: Optional[str] = None,
|
||
doc_name: Optional[str] = None,
|
||
regulation_type: str = "",
|
||
version: str = ""
|
||
) -> ProcessingResult:
|
||
"""
|
||
处理单个文档
|
||
|
||
Args:
|
||
file_path: 文档文件路径
|
||
doc_id: 文档ID(可选,默认自动生成)
|
||
doc_name: 文档名称(可选,默认从文件名获取)
|
||
regulation_type: 法规类型
|
||
version: 文档版本
|
||
|
||
Returns:
|
||
ProcessingResult: 处理结果
|
||
"""
|
||
# 生成或使用传入的文档ID
|
||
if doc_id is None:
|
||
doc_id = str(uuid.uuid4())[:8]
|
||
|
||
# 获取文档名称
|
||
if doc_name is None:
|
||
doc_name = os.path.basename(file_path)
|
||
|
||
logger.info(f"开始处理文档: {doc_name} (ID: {doc_id})")
|
||
|
||
# 初始化结果变量
|
||
summary = ""
|
||
summary_latency_ms = 0
|
||
|
||
try:
|
||
# 1. 文档解析
|
||
logger.info("Step 1: 文档解析")
|
||
markdown_text = self._parse_document(file_path)
|
||
|
||
if not markdown_text:
|
||
return ProcessingResult(
|
||
doc_id=doc_id,
|
||
doc_name=doc_name,
|
||
success=False,
|
||
message="文档解析失败,内容为空"
|
||
)
|
||
|
||
# 2. LLM摘要生成(可选)
|
||
if self.generate_summary:
|
||
logger.info("Step 2: LLM摘要生成")
|
||
self._init_summarizer()
|
||
summary_result = self.summarizer.summarize(
|
||
doc_name,
|
||
markdown_text,
|
||
regulation_type
|
||
)
|
||
if summary_result.is_success:
|
||
summary = summary_result.summary
|
||
summary_latency_ms = summary_result.latency_ms
|
||
logger.success(f"摘要生成完成: {summary_latency_ms}ms")
|
||
else:
|
||
logger.warning(f"摘要生成失败: {summary_result.error}")
|
||
else:
|
||
logger.info("Step 2: 跳过摘要生成(未勾选,节省约60秒)")
|
||
|
||
# 3. 智能分块
|
||
logger.info("Step 3: 智能分块")
|
||
chunks = self._chunk_document(
|
||
markdown_text,
|
||
doc_id,
|
||
doc_name,
|
||
regulation_type,
|
||
version
|
||
)
|
||
|
||
if not chunks:
|
||
return ProcessingResult(
|
||
doc_id=doc_id,
|
||
doc_name=doc_name,
|
||
success=False,
|
||
message="分块失败,无有效内容",
|
||
markdown_text=markdown_text,
|
||
summary=summary
|
||
)
|
||
|
||
# 4. 向量嵌入
|
||
logger.info("Step 4: 向量嵌入")
|
||
embeddings = self._embed_chunks(chunks)
|
||
|
||
if embeddings is None:
|
||
return ProcessingResult(
|
||
doc_id=doc_id,
|
||
doc_name=doc_name,
|
||
success=False,
|
||
message="向量嵌入失败",
|
||
markdown_text=markdown_text,
|
||
summary=summary
|
||
)
|
||
|
||
# 5. 存储入库
|
||
logger.info("Step 5: 存储入库")
|
||
inserted_ids = self._insert_to_milvus(chunks, embeddings)
|
||
|
||
logger.success(f"文档处理完成: {doc_name}, 共{len(inserted_ids)}条记录")
|
||
|
||
return ProcessingResult(
|
||
doc_id=doc_id,
|
||
doc_name=doc_name,
|
||
success=True,
|
||
num_chunks=len(inserted_ids),
|
||
message="处理成功",
|
||
markdown_text=markdown_text,
|
||
summary=summary,
|
||
summary_latency_ms=summary_latency_ms
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.error(f"文档处理失败: {e}")
|
||
return ProcessingResult(
|
||
doc_id=doc_id,
|
||
doc_name=doc_name,
|
||
success=False,
|
||
message=f"处理失败: {str(e)}"
|
||
)
|
||
|
||
def _parse_document(self, file_path: str) -> str:
|
||
"""解析文档"""
|
||
ext = os.path.splitext(file_path)[1].lower()
|
||
|
||
try:
|
||
if ext == ".pdf":
|
||
# PDF文档解析(优先MinerU,回退PyMuPDF)
|
||
markdown_text = self.parser.parse_pdf(file_path, prefer_mineru=self.use_mineru)
|
||
elif ext in [".docx", ".doc"]:
|
||
# Word文档解析
|
||
markdown_text = self.parser.parse_docx(file_path)
|
||
else:
|
||
logger.warning(f"不支持的文件类型: {ext}")
|
||
return ""
|
||
|
||
logger.success(f"文档解析完成,内容长度: {len(markdown_text)}字符")
|
||
return markdown_text
|
||
|
||
except Exception as e:
|
||
logger.error(f"文档解析失败: {e}")
|
||
return ""
|
||
|
||
def _chunk_document(
|
||
self,
|
||
markdown_text: str,
|
||
doc_id: str,
|
||
doc_name: str,
|
||
regulation_type: str,
|
||
version: str
|
||
) -> List[TextChunk]:
|
||
"""分块文档"""
|
||
try:
|
||
chunks = self.chunker.chunk_document(
|
||
markdown_text,
|
||
doc_id=doc_id,
|
||
doc_name=doc_name,
|
||
regulation_type=regulation_type,
|
||
version=version
|
||
)
|
||
logger.success(f"分块完成,共{len(chunks)}个chunk")
|
||
return chunks
|
||
|
||
except Exception as e:
|
||
logger.error(f"分块失败: {e}")
|
||
return []
|
||
|
||
def _embed_chunks(self, chunks: List[TextChunk]) -> Optional[EmbeddingResult]:
|
||
"""嵌入分块"""
|
||
try:
|
||
# 延迟初始化嵌入模型
|
||
self._init_embedder()
|
||
|
||
# 提取文本内容
|
||
texts = [chunk.content for chunk in chunks]
|
||
|
||
# 执行嵌入
|
||
embeddings = self.embedder.embed(texts)
|
||
|
||
logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}")
|
||
return embeddings
|
||
|
||
except Exception as e:
|
||
logger.error(f"嵌入失败: {e}")
|
||
return None
|
||
|
||
def _insert_to_milvus(
|
||
self,
|
||
chunks: List[TextChunk],
|
||
embeddings: EmbeddingResult
|
||
) -> List[int]:
|
||
"""插入Milvus"""
|
||
try:
|
||
# 延迟初始化Milvus
|
||
self._init_milvus()
|
||
|
||
# 执行插入
|
||
inserted_ids = self.milvus.insert_chunks(chunks, embeddings)
|
||
|
||
logger.success(f"入库完成,共{len(inserted_ids)}条记录")
|
||
return inserted_ids
|
||
|
||
except Exception as e:
|
||
logger.error(f"入库失败: {e}")
|
||
return []
|
||
|
||
def search(
|
||
self,
|
||
query: str,
|
||
top_k: int = 10,
|
||
filters: Optional[str] = None
|
||
) -> List[Dict]:
|
||
"""
|
||
检索法规内容
|
||
|
||
Args:
|
||
query: 查询文本
|
||
top_k: 返回结果数量
|
||
filters: 过滤条件
|
||
|
||
Returns:
|
||
List[Dict]: 检索结果
|
||
"""
|
||
logger.info(f"执行检索: {query}")
|
||
|
||
try:
|
||
# 延迟初始化
|
||
self._init_embedder()
|
||
self._init_milvus()
|
||
|
||
# 生成查询向量
|
||
query_embedding = self.embedder.embed_single(query)
|
||
|
||
# 执行混合检索
|
||
results = self.milvus.hybrid_search(
|
||
query_dense=query_embedding['dense'].tolist(),
|
||
query_sparse=query_embedding['sparse'],
|
||
top_k=top_k,
|
||
filters=filters
|
||
)
|
||
|
||
# 转换为字典格式
|
||
result_dicts = []
|
||
for r in results:
|
||
result_dicts.append({
|
||
"id": r.id,
|
||
"content": r.content,
|
||
"score": r.score,
|
||
"metadata": r.metadata
|
||
})
|
||
|
||
logger.success(f"检索完成,返回{len(result_dicts)}条结果")
|
||
return result_dicts
|
||
|
||
except Exception as e:
|
||
logger.error(f"检索失败: {e}")
|
||
return []
|
||
|
||
def close(self):
|
||
"""关闭连接"""
|
||
if self.milvus:
|
||
self.milvus.disconnect()
|
||
logger.info("文档处理器已关闭")
|
||
|
||
|
||
def process_document(
|
||
file_path: str,
|
||
doc_name: Optional[str] = None,
|
||
regulation_type: str = "",
|
||
version: str = ""
|
||
) -> ProcessingResult:
|
||
"""便捷函数:处理单个文档"""
|
||
processor = DocumentProcessor()
|
||
result = processor.process(file_path, doc_name, regulation_type, version)
|
||
processor.close()
|
||
return result
|
||
|
||
|
||
def search_regulations(query: str, top_k: int = 10) -> List[Dict]:
|
||
"""便捷函数:检索法规"""
|
||
processor = DocumentProcessor()
|
||
results = processor.search(query, top_k)
|
||
processor.close()
|
||
return results
|