This commit is contained in:
2026-05-14 15:07:34 +08:00
parent c2a398930d
commit 10d04c4083
179 changed files with 24073 additions and 1243 deletions

View File

@@ -0,0 +1,204 @@
# src/services/parser/mineru_parser.py
"""MinerU多模态PDF解析 - 版面感知解析"""
from typing import Optional, Dict
from dataclasses import dataclass, field
from loguru import logger
import os
@dataclass
class MinerUResult:
    """Outcome of a single MinerU parsing attempt.

    Attributes:
        file_path: Path of the document that was handed to the parser.
        markdown_text: Markdown produced by the parser.
        metadata: String-valued details about the parsed document.
        success: Whether the parsing attempt completed without an error.
        error_message: Description of the failure, if any.
    """

    # Required fields: every result names its source file and carries text.
    file_path: str
    markdown_text: str

    # Optional fields. The factory gives each instance its own dict so that
    # results never share one mutable mapping.
    metadata: Dict[str, str] = field(default_factory=dict)
    success: bool = True
    error_message: str = ""
class MinerUParser:
    """Layout-aware PDF parser backed by MinerU (magic-pdf).

    MinerU is an open-source PDF parsing tool that recognises headings,
    body text, tables and images and emits structured Markdown.

    GitHub: https://github.com/opendatalab/MinerU

    Attributes:
        available: True when the magic-pdf pipeline could be imported at
            construction time; ``parse`` short-circuits when it is False.
    """

    # Keys copied from the pipeline's intermediate data into the metadata.
    _METADATA_KEYS = ("page_count", "language", "is_scanned")

    def __init__(self):
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return True when the magic-pdf pipeline can be imported.

        Any import-time failure marks MinerU as unavailable instead of
        propagating, so constructing the parser never raises because of
        the optional dependency.
        """
        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装,请运行: pip install magic-pdf[full]")
            return False
        except Exception as e:
            # An installed-but-broken package can raise something other than
            # ImportError while importing; treat that as "not available" too.
            logger.warning(f"MinerU (magic-pdf) 导入失败: {e}")
            return False
        return True

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> "MinerUResult":
        """Parse a PDF document with MinerU.

        Args:
            file_path: Path of the PDF file.
            output_dir: Directory for parsing artefacts. Defaults to the
                directory that contains ``file_path``.

        Returns:
            A MinerUResult. Failures never raise; they are reported through
            ``success=False`` and ``error_message``.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")
        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )
        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            if output_dir is None:
                # Resolve first: dirname() of a bare file name is "", which
                # is not a usable output directory.
                output_dir = os.path.dirname(os.path.abspath(file_path))

            # NOTE(review): magic-pdf's published demos build UNIPipe from the
            # PDF bytes, a jso_useful_key dict and an image writer, then call
            # pipe_classify / pipe_analyze / pipe_parse / pipe_mk_markdown.
            # Confirm that this two-argument constructor and pipe_mk() exist
            # in the pinned magic-pdf version.
            pipe = UNIPipe(file_path, output_dir)
            markdown_content = pipe.pipe_mk()
            if not isinstance(markdown_content, str):
                # markdown_text is declared as str; report anything else as a
                # failure so callers can fall back to another parser.
                raise TypeError(
                    f"MinerU返回了非文本结果: {type(markdown_content).__name__}"
                )

            logger.success("MinerU解析成功")
            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )
        except Exception as e:
            # Deliberate catch-all: the caller relies on a result object, not
            # an exception, to decide whether to fall back.
            logger.error(f"MinerU解析失败: {e}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(e),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """Collect string metadata from the pipeline's intermediate data.

        Args:
            pipe: Pipeline object; its ``pdf_mid_data`` attribute is read
                when present and truthy.

        Returns:
            A dict with the keys ``page_count``, ``language`` and
            ``is_scanned`` (missing or None values become ""), or an empty
            dict when no intermediate data is available or reading it fails.
        """
        metadata: Dict[str, str] = {}
        try:
            mid_data = getattr(pipe, "pdf_mid_data", None)
            if mid_data:
                values = {key: mid_data.get(key) for key in self._METADATA_KEYS}
                # Map None to "" instead of the literal string "None".
                metadata = {
                    key: "" if value is None else str(value)
                    for key, value in values.items()
                }
        except Exception as e:
            # Best effort: metadata is optional and must not fail the parse.
            logger.warning(f"提取MinerU元数据失败: {e}")
        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` and return its Markdown, or "" on failure."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""
class ParserOrchestrator:
    """Parsing facade that picks a parser by priority.

    Strategy:
        1. Try MinerU first for its layout-aware output.
        2. Fall back to the basic PDF parser when MinerU is unavailable,
           not preferred, or reports a failure.
    """

    def __init__(self):
        # Imported here rather than at module level, as in the original.
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        # Snapshot of the availability probe done by MinerUParser.
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """Parse a PDF, choosing the parser by priority.

        Args:
            file_path: Path of the PDF file.
            prefer_mineru: Whether MinerU should be tried first.

        Returns:
            Markdown text from MinerU when it succeeds, otherwise the output
            of the basic PDF parser.
        """
        if prefer_mineru and self.mineru_available:
            outcome = self.mineru_parser.parse(file_path)
            if not outcome.success:
                logger.warning(f"MinerU解析失败回退到PyMuPDF: {outcome.error_message}")
            else:
                text = outcome.markdown_text
                logger.info("使用MinerU解析成功")
                return text

        # Reached when MinerU was skipped or did not succeed.
        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word document and return its Markdown text."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
        """Dispatch to a parser based on the file extension.

        Args:
            file_path: Path of the file to parse.

        Returns:
            Markdown text produced by the selected parser.

        Raises:
            ValueError: If the extension is not .pdf, .docx or .doc.
        """
        _, suffix = os.path.splitext(file_path)
        suffix = suffix.lower()

        if suffix == ".pdf":
            return self.parse_pdf(file_path)
        if suffix in (".docx", ".doc"):
            return self.parse_docx(file_path)
        raise ValueError(f"不支持的文件类型: {suffix}")
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Convenience wrapper: parse ``file_path`` with a fresh MinerUParser.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The MinerUResult produced by ``MinerUParser.parse``.
    """
    return MinerUParser().parse(file_path)
def parse_pdf_smart(file_path: str) -> str:
    """Convenience wrapper: parse a PDF through a fresh ParserOrchestrator.

    Args:
        file_path: Path of the PDF file.

    Returns:
        Markdown text returned by ``ParserOrchestrator.parse_pdf``.
    """
    return ParserOrchestrator().parse_pdf(file_path)