Files
AIRegulation-DocAnalysis/backend/app/services/document_processor.py

405 lines
12 KiB
Python
Raw Normal View History

2026-05-14 15:07:34 +08:00
# src/services/document_processor.py
"""文档处理主流程 - 解析→摘要→分块→嵌入→入库"""
import os
from typing import List, Dict, Optional
from dataclasses import dataclass
from loguru import logger
import uuid
from .parser.pdf_parser import PDFParser
from .parser.docx_parser import DocxParser
from .parser.mineru_parser import ParserOrchestrator
from .embedding.text_chunker import RegulationChunker, TextChunk
from .embedding.bge_m3_embedder import BGEM3Embedder, EmbeddingResult
from .storage.milvus_client import MilvusClient
from .llm.document_summarizer import DocumentSummarizer, DocumentSummary
from app.config.settings import settings
@dataclass
class ProcessingResult:
"""文档处理结果"""
doc_id: str
doc_name: str
success: bool
num_chunks: int = 0
message: str = ""
markdown_text: str = ""
summary: str = ""
summary_latency_ms: int = 0
class DocumentProcessor:
"""
文档处理服务 - 完整处理流程
流程
1. 文档解析PDF/DOCX Markdown
2. 智能分块章节级+条款级
3. LLM摘要生成可选
4. 向量嵌入BGE-M3 Dense+Sparse
5. 存储入库Milvus向量数据库
"""
def __init__(
self,
chunk_size: int = None,
embedding_model: str = None,
use_mineru: bool = True,
generate_summary: bool = False, # 默认不生成摘要节省约60秒
llm_provider: str = None,
llm_model: str = None
):
"""
初始化文档处理器
Args:
chunk_size: 分块大小
embedding_model: 嵌入模型名称
use_mineru: 是否优先使用MinerU解析
generate_summary: 是否生成文档摘要默认False可节省约60秒处理时间
llm_provider: LLM提供商
llm_model: LLM模型名称
"""
self.chunk_size = chunk_size or settings.chunk_size
self.embedding_model = embedding_model or settings.embedding_model
self.use_mineru = use_mineru
self.generate_summary = generate_summary
self.llm_provider = llm_provider or settings.llm_provider
self.llm_model = llm_model or settings.llm_model
# 初始化各组件
logger.info("初始化文档处理组件...")
# 解析器
self.parser = ParserOrchestrator()
# 分块器
self.chunker = RegulationChunker(chunk_size=self.chunk_size)
# 嵌入模型(延迟加载)
self.embedder: Optional[BGEM3Embedder] = None
# Milvus客户端延迟连接
self.milvus: Optional[MilvusClient] = None
# 摘要生成器(延迟加载)
self.summarizer: Optional[DocumentSummarizer] = None
logger.success("文档处理器初始化完成")
def _init_embedder(self):
"""延迟初始化嵌入模型"""
if self.embedder is None:
logger.info("加载嵌入模型...")
self.embedder = BGEM3Embedder(model_name=self.embedding_model)
def _init_milvus(self):
"""延迟初始化Milvus连接"""
if self.milvus is None:
logger.info("连接Milvus...")
self.milvus = MilvusClient()
self.milvus.connect()
self.milvus.create_collection(recreate=False)
self.milvus.load_collection()
def _init_summarizer(self):
"""延迟初始化摘要生成器"""
if self.summarizer is None:
logger.info("初始化摘要生成器...")
self.summarizer = DocumentSummarizer(
provider=self.llm_provider,
model=self.llm_model
)
def process(
self,
file_path: str,
doc_id: Optional[str] = None,
doc_name: Optional[str] = None,
regulation_type: str = "",
version: str = ""
) -> ProcessingResult:
"""
处理单个文档
Args:
file_path: 文档文件路径
doc_id: 文档ID可选默认自动生成
doc_name: 文档名称可选默认从文件名获取
regulation_type: 法规类型
version: 文档版本
Returns:
ProcessingResult: 处理结果
"""
# 生成或使用传入的文档ID
if doc_id is None:
doc_id = str(uuid.uuid4())[:8]
# 获取文档名称
if doc_name is None:
doc_name = os.path.basename(file_path)
logger.info(f"开始处理文档: {doc_name} (ID: {doc_id})")
# 初始化结果变量
summary = ""
summary_latency_ms = 0
try:
# 1. 文档解析
logger.info("Step 1: 文档解析")
markdown_text = self._parse_document(file_path)
if not markdown_text:
return ProcessingResult(
doc_id=doc_id,
doc_name=doc_name,
success=False,
message="文档解析失败,内容为空"
)
# 2. LLM摘要生成可选
if self.generate_summary:
logger.info("Step 2: LLM摘要生成")
self._init_summarizer()
summary_result = self.summarizer.summarize(
doc_name,
markdown_text,
regulation_type
)
if summary_result.is_success:
summary = summary_result.summary
summary_latency_ms = summary_result.latency_ms
logger.success(f"摘要生成完成: {summary_latency_ms}ms")
else:
logger.warning(f"摘要生成失败: {summary_result.error}")
else:
logger.info("Step 2: 跳过摘要生成未勾选节省约60秒")
# 3. 智能分块
logger.info("Step 3: 智能分块")
chunks = self._chunk_document(
markdown_text,
doc_id,
doc_name,
regulation_type,
version
)
if not chunks:
return ProcessingResult(
doc_id=doc_id,
doc_name=doc_name,
success=False,
message="分块失败,无有效内容",
markdown_text=markdown_text,
summary=summary
)
# 4. 向量嵌入
logger.info("Step 4: 向量嵌入")
embeddings = self._embed_chunks(chunks)
if embeddings is None:
return ProcessingResult(
doc_id=doc_id,
doc_name=doc_name,
success=False,
message="向量嵌入失败",
markdown_text=markdown_text,
summary=summary
)
# 5. 存储入库
logger.info("Step 5: 存储入库")
inserted_ids = self._insert_to_milvus(chunks, embeddings)
logger.success(f"文档处理完成: {doc_name}, 共{len(inserted_ids)}条记录")
return ProcessingResult(
doc_id=doc_id,
doc_name=doc_name,
success=True,
num_chunks=len(inserted_ids),
message="处理成功",
markdown_text=markdown_text,
summary=summary,
summary_latency_ms=summary_latency_ms
)
except Exception as e:
logger.error(f"文档处理失败: {e}")
return ProcessingResult(
doc_id=doc_id,
doc_name=doc_name,
success=False,
message=f"处理失败: {str(e)}"
)
def _parse_document(self, file_path: str) -> str:
"""解析文档"""
ext = os.path.splitext(file_path)[1].lower()
try:
if ext == ".pdf":
# PDF文档解析优先MinerU回退PyMuPDF
markdown_text = self.parser.parse_pdf(file_path, prefer_mineru=self.use_mineru)
elif ext in [".docx", ".doc"]:
# Word文档解析
markdown_text = self.parser.parse_docx(file_path)
else:
logger.warning(f"不支持的文件类型: {ext}")
return ""
logger.success(f"文档解析完成,内容长度: {len(markdown_text)}字符")
return markdown_text
except Exception as e:
logger.error(f"文档解析失败: {e}")
return ""
def _chunk_document(
self,
markdown_text: str,
doc_id: str,
doc_name: str,
regulation_type: str,
version: str
) -> List[TextChunk]:
"""分块文档"""
try:
chunks = self.chunker.chunk_document(
markdown_text,
doc_id=doc_id,
doc_name=doc_name,
regulation_type=regulation_type,
version=version
)
logger.success(f"分块完成,共{len(chunks)}个chunk")
return chunks
except Exception as e:
logger.error(f"分块失败: {e}")
return []
def _embed_chunks(self, chunks: List[TextChunk]) -> Optional[EmbeddingResult]:
"""嵌入分块"""
try:
# 延迟初始化嵌入模型
self._init_embedder()
# 提取文本内容
texts = [chunk.content for chunk in chunks]
# 执行嵌入
embeddings = self.embedder.embed(texts)
logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}")
return embeddings
except Exception as e:
logger.error(f"嵌入失败: {e}")
return None
def _insert_to_milvus(
self,
chunks: List[TextChunk],
embeddings: EmbeddingResult
) -> List[int]:
"""插入Milvus"""
try:
# 延迟初始化Milvus
self._init_milvus()
# 执行插入
inserted_ids = self.milvus.insert_chunks(chunks, embeddings)
logger.success(f"入库完成,共{len(inserted_ids)}条记录")
return inserted_ids
except Exception as e:
logger.error(f"入库失败: {e}")
return []
def search(
self,
query: str,
top_k: int = 10,
filters: Optional[str] = None
) -> List[Dict]:
"""
检索法规内容
Args:
query: 查询文本
top_k: 返回结果数量
filters: 过滤条件
Returns:
List[Dict]: 检索结果
"""
logger.info(f"执行检索: {query}")
try:
# 延迟初始化
self._init_embedder()
self._init_milvus()
# 生成查询向量
query_embedding = self.embedder.embed_single(query)
# 执行混合检索
results = self.milvus.hybrid_search(
query_dense=query_embedding['dense'].tolist(),
query_sparse=query_embedding['sparse'],
top_k=top_k,
filters=filters
)
# 转换为字典格式
result_dicts = []
for r in results:
result_dicts.append({
"id": r.id,
"content": r.content,
"score": r.score,
"metadata": r.metadata
})
logger.success(f"检索完成,返回{len(result_dicts)}条结果")
return result_dicts
except Exception as e:
logger.error(f"检索失败: {e}")
return []
def close(self):
"""关闭连接"""
if self.milvus:
self.milvus.disconnect()
logger.info("文档处理器已关闭")
def process_document(
file_path: str,
doc_name: Optional[str] = None,
regulation_type: str = "",
version: str = ""
) -> ProcessingResult:
"""便捷函数:处理单个文档"""
processor = DocumentProcessor()
result = processor.process(file_path, doc_name, regulation_type, version)
processor.close()
return result
def search_regulations(query: str, top_k: int = 10) -> List[Dict]:
"""便捷函数:检索法规"""
processor = DocumentProcessor()
results = processor.search(query, top_k)
processor.close()
return results