Files
AIRegulation-DocAnalysis/backend/app/services/parser/mineru_parser.py

204 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""MinerU多模态PDF解析 - 版面感知解析"""
from typing import Optional, Dict
from dataclasses import dataclass, field
from loguru import logger
import os
@dataclass
class MinerUResult:
    """Outcome of one MinerU parsing attempt.

    Attributes:
        file_path: Path of the document that was handed to the parser.
        markdown_text: Markdown produced by the parser (empty on failure).
        metadata: String-valued metadata gathered from the parsing pipeline;
            each instance gets its own fresh dict by default.
        success: Whether the parse completed without error.
        error_message: Description of the failure; empty when ``success``
            is true.
    """

    file_path: str
    markdown_text: str
    # default_factory keeps the default dict from being shared across instances.
    metadata: Dict[str, str] = field(default_factory=dict)
    success: bool = True
    error_message: str = ""
class MinerUParser:
    """Layout-aware PDF parser backed by MinerU (the ``magic-pdf`` package).

    MinerU is an open-source PDF parsing tool. Per the project notes it
    recognises document elements such as headings, body text, tables and
    images, and emits structured Markdown.

    GitHub: https://github.com/opendatalab/MinerU

    Attributes:
        available: ``True`` when ``magic_pdf.pipe.UNIPipe`` could be imported
            at construction time; ``parse`` short-circuits when it is false.
    """

    def __init__(self):
        # Probe once so later parse() calls can bail out cheaply.
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return whether the MinerU pipeline class can be imported.

        This probe never raises: an optional dependency that is missing *or*
        broken must not prevent this parser (and any caller that wants to
        fall back to another parser) from being constructed.
        """
        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装,请运行: pip install magic-pdf[full]")
            return False
        except Exception as exc:
            # Installed but failing at import time (anything other than
            # ImportError): treat it as unavailable instead of crashing.
            logger.warning(f"MinerU (magic-pdf) 导入失败: {exc}")
            return False
        return True

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> "MinerUResult":
        """Parse a PDF document with MinerU.

        Args:
            file_path: Path to the PDF file.
            output_dir: Directory handed to the pipeline for its artefacts.
                Defaults to the directory containing ``file_path`` (the
                current directory when ``file_path`` has no directory part).

        Returns:
            A ``MinerUResult``. Failures are never raised; they are reported
            through ``success=False`` and ``error_message``.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")

        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )

        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            if output_dir is None:
                # dirname() yields "" for a bare file name; make the current
                # directory explicit instead of passing an empty path on.
                output_dir = os.path.dirname(file_path) or "."

            # The original notes mention OCR modes (auto / txt / ocr), but no
            # mode is passed to the pipeline here.
            # NOTE(review): confirm this constructor call and ``pipe_mk()``
            # against the installed magic-pdf version. Upstream examples appear
            # to build UNIPipe from PDF bytes plus an image writer and then
            # call pipe_classify / pipe_analyze / pipe_parse /
            # pipe_mk_markdown. If the API differs, the except-branch below
            # turns the resulting error into a failed MinerUResult.
            pipe = UNIPipe(file_path, output_dir)
            markdown_content = pipe.pipe_mk()

            logger.success("MinerU解析成功")
            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )
        except Exception as e:
            # Boundary handler: callers rely on a result object, not an
            # exception, to decide whether to fall back to another parser.
            logger.error(f"MinerU解析失败: {e}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(e),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """Collect string-valued metadata from a finished pipeline.

        Reads ``pipe.pdf_mid_data`` when that attribute exists and is truthy,
        and copies the ``page_count``, ``language`` and ``is_scanned`` entries
        (stringified; missing keys become ``""``). Returns an empty dict when
        the attribute is absent/empty or when extraction fails.
        """
        metadata: Dict[str, str] = {}
        try:
            mid_data = getattr(pipe, "pdf_mid_data", None)
            if mid_data:
                metadata = {
                    key: str(mid_data.get(key, ""))
                    for key in ("page_count", "language", "is_scanned")
                }
        except Exception as e:
            # Metadata is best-effort; never fail the parse because of it.
            logger.warning(f"提取MinerU元数据失败: {e}")
        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` and return only the Markdown text.

        Returns an empty string when the parse did not succeed.
        """
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""
class ParserOrchestrator:
    """Route documents to a parser, picking PDF parsers by priority.

    Strategy:
        1. Try MinerU first.
        2. When MinerU fails, fall back to the basic ``PDFParser`` (described
           in this file's log messages as PyMuPDF parsing).
    """

    def __init__(self):
        # The PDF parser module is imported here rather than at module level.
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        # Snapshot of the MinerU probe taken at construction time.
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """Parse a PDF, choosing the parser by priority.

        Args:
            file_path: Path to the PDF file.
            prefer_mineru: Whether MinerU should be attempted first.

        Returns:
            Markdown text from MinerU when it was attempted and succeeded,
            otherwise whatever the basic PDF parser returns.
        """
        if prefer_mineru and self.mineru_available:
            outcome = self.mineru_parser.parse(file_path)
            if outcome.success:
                text = outcome.markdown_text
                logger.info("使用MinerU解析成功")
                return text
            logger.warning(f"MinerU解析失败回退到PyMuPDF: {outcome.error_message}")

        # Reached when MinerU was skipped, unavailable, or failed.
        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word document and return its Markdown text."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
        """Pick a parser from the file extension and run it.

        Args:
            file_path: Path to the document.

        Returns:
            Markdown text produced by the selected parser.

        Raises:
            ValueError: If the (lower-cased) extension is not ``.pdf``,
                ``.docx`` or ``.doc``.
        """
        _, suffix = os.path.splitext(file_path)
        ext = suffix.lower()

        handlers = {
            ".pdf": self.parse_pdf,
            ".docx": self.parse_docx,
            ".doc": self.parse_docx,
        }
        handler = handlers.get(ext)
        if handler is None:
            raise ValueError(f"不支持的文件类型: {ext}")
        return handler(file_path)
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Convenience wrapper: parse ``file_path`` with a fresh ``MinerUParser``.

    Returns the parser's ``MinerUResult`` unchanged.
    """
    return MinerUParser().parse(file_path)
def parse_pdf_smart(file_path: str) -> str:
    """Convenience wrapper: parse a PDF via a fresh ``ParserOrchestrator``.

    The orchestrator decides which PDF parser to use; the Markdown text it
    returns is passed straight through.
    """
    return ParserOrchestrator().parse_pdf(file_path)