# src/services/parser/mineru_parser.py
"""MinerU multimodal PDF parsing - layout-aware parsing."""

from typing import Optional, Dict
from dataclasses import dataclass, field
from loguru import logger
import os


@dataclass
class MinerUResult:
    """Outcome of a single MinerU parse attempt."""

    # Path of the document that was handed to the parser.
    file_path: str
    # Markdown produced by the pipeline; empty string when parsing failed.
    markdown_text: str
    # Best-effort metadata collected from the pipeline (may be empty).
    metadata: Dict[str, str] = field(default_factory=dict)
    # False when MinerU is unavailable or the pipeline raised.
    success: bool = True
    # Failure reason; empty on success.
    error_message: str = ""


class MinerUParser:
    """
    MinerU multimodal PDF parser.

    MinerU (magic-pdf) is an open-source, high-quality PDF parsing tool.
    It supports layout-aware parsing, recognising titles, body text, tables,
    images and other elements, and emits structured Markdown.

    GitHub: https://github.com/opendatalab/MinerU
    """

    def __init__(self):
        # Probe once at construction time; parse() short-circuits when False.
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return True when the magic-pdf pipeline module can be imported."""
        try:
            # Imported only to test availability; the name itself is unused.
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装,请运行: pip install magic-pdf[full]")
            return False
        return True

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult:
        """
        Parse a PDF document with MinerU.

        Args:
            file_path: Path to the PDF file.
            output_dir: Optional directory for parse artifacts. Defaults to
                the directory containing ``file_path``, or the current
                directory when ``file_path`` has no directory component.

        Returns:
            MinerUResult: ``success`` is False (with ``error_message`` set)
            when MinerU is not installed or the pipeline raises. Pipeline
            exceptions are logged and converted into a failed result rather
            than propagated.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")

        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )

        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            if output_dir is None:
                # dirname() is "" for a bare file name; fall back to "." so
                # the pipeline never receives an empty output path.
                output_dir = os.path.dirname(file_path) or "."

            # Build the parsing pipeline.
            # The OCR mode can be chosen according to the PDF type:
            #   auto: decide automatically whether OCR is needed
            #   txt:  text-based PDF (no OCR)
            #   ocr:  scanned PDF (OCR)
            # No mode is passed here.
            # NOTE(review): confirm the UNIPipe constructor arguments and the
            # pipe_mk() method against the installed magic-pdf release.
            pipe = UNIPipe(file_path, output_dir)

            # Run the pipeline; pipe_mk() is expected to return Markdown text.
            markdown_content = pipe.pipe_mk()

            logger.success("MinerU解析成功")

            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )

        except Exception as exc:
            # Boundary handler: any pipeline failure becomes a failed result
            # so callers can fall back to another parser.
            logger.error(f"MinerU解析失败: {exc}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(exc),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """
        Collect metadata from a finished pipeline on a best-effort basis.

        Reads ``page_count``, ``language`` and ``is_scanned`` from
        ``pipe.pdf_mid_data`` when that attribute exists and is truthy; a
        missing key becomes an empty string. Any error is logged as a warning
        and an empty dict is returned.
        """
        metadata: Dict[str, str] = {}
        try:
            mid_data = getattr(pipe, "pdf_mid_data", None)
            if mid_data:
                # NOTE(review): these key names are assumptions about the
                # shape of pdf_mid_data; verify against magic-pdf.
                metadata = {
                    "page_count": str(mid_data.get("page_count", "")),
                    "language": str(mid_data.get("language", "")),
                    "is_scanned": str(mid_data.get("is_scanned", "")),
                }
        except Exception as exc:
            logger.warning(f"提取MinerU元数据失败: {exc}")

        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` and return its Markdown, or "" on failure."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""


class ParserOrchestrator:
    """
    Parsing orchestration - choose a parser by priority.

    Strategy:
    1. Try MinerU first (strong layout awareness).
    2. Fall back to basic PyMuPDF parsing when MinerU fails.
    """

    def __init__(self):
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        # Snapshot of the availability probe made by MinerUParser.__init__.
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """
        Parse a PDF document, choosing a parser by priority.

        Args:
            file_path: Path to the PDF file.
            prefer_mineru: Whether to try MinerU before the basic parser.

        Returns:
            str: Markdown text from MinerU when it succeeds, otherwise the
            output of ``PDFParser.parse_to_markdown``.
        """
        if prefer_mineru and self.mineru_available:
            # Try MinerU first.
            result = self.mineru_parser.parse(file_path)
            if result.success:
                logger.info("使用MinerU解析成功")
                return result.markdown_text
            logger.warning(f"MinerU解析失败,回退到PyMuPDF: {result.error_message}")

        # Fall back to the basic PDF parser.
        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word document into Markdown using ``DocxParser``."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
        """
        Choose a parser based on the file extension (case-insensitive).

        Args:
            file_path: Path to the file.

        Returns:
            str: Markdown text.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == ".pdf":
            return self.parse_pdf(file_path)
        if ext in (".docx", ".doc"):
            # NOTE(review): legacy .doc files are also routed to DocxParser;
            # confirm that parser can actually read that format.
            return self.parse_docx(file_path)
        raise ValueError(f"不支持的文件类型: {ext}")


def parse_with_mineru(file_path: str) -> MinerUResult:
    """Convenience function: parse ``file_path`` with a new ``MinerUParser``."""
    return MinerUParser().parse(file_path)


def parse_pdf_smart(file_path: str) -> str:
    """
    Convenience function: parse a PDF and let the orchestrator pick the parser.

    Builds a new ``ParserOrchestrator`` on every call and delegates to its
    ``parse_pdf`` method.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: Whatever ``ParserOrchestrator.parse_pdf`` returns for the file.
    """
    return ParserOrchestrator().parse_pdf(file_path)