Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,14 +1,16 @@
|
||||
"""MinerU多模态PDF解析 - 版面感知解析"""
|
||||
"""Provide service-layer logic for mineru parser."""
|
||||
|
||||
from typing import Optional, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
import os
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
class MinerUResult:
|
||||
"""MinerU解析结果"""
|
||||
"""Represent the Miner U Result type."""
|
||||
file_path: str
|
||||
markdown_text: str
|
||||
metadata: Dict[str, str] = field(default_factory=dict)
|
||||
@@ -17,21 +19,14 @@ class MinerUResult:
|
||||
|
||||
|
||||
class MinerUParser:
|
||||
"""
|
||||
MinerU多模态PDF解析器
|
||||
|
||||
MinerU (magic-pdf) 是一个开源的高质量PDF解析工具,
|
||||
支持版面感知解析,能够识别文档中的标题、正文、表格、图片等元素,
|
||||
并输出结构化的Markdown格式。
|
||||
|
||||
GitHub: https://github.com/opendatalab/MinerU
|
||||
"""
|
||||
"""Provide the Miner U Parser parser."""
|
||||
|
||||
def __init__(self):
    """Initialize the parser and record whether MinerU can be used.

    The availability probe runs once here; its result is stored in
    ``self.available`` so later calls can consult the flag instead of
    probing again.
    """
    self.available = self._check_mineru_available()
|
||||
|
||||
def _check_mineru_available(self) -> bool:
|
||||
"""检查MinerU是否可用"""
|
||||
"""Handle check mineru available for this module for the Miner U Parser instance."""
|
||||
try:
|
||||
from magic_pdf.pipe.UNIPipe import UNIPipe
|
||||
return True
|
||||
@@ -40,16 +35,7 @@ class MinerUParser:
|
||||
return False
|
||||
|
||||
def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult:
|
||||
"""
|
||||
使用MinerU解析PDF文档
|
||||
|
||||
Args:
|
||||
file_path: PDF文件路径
|
||||
output_dir: 输出目录(可选,用于保存解析产物)
|
||||
|
||||
Returns:
|
||||
MinerUResult: 解析结果
|
||||
"""
|
||||
"""Handle parse for the Miner U Parser instance."""
|
||||
logger.info(f"尝试使用MinerU解析: {file_path}")
|
||||
|
||||
if not self.available:
|
||||
@@ -64,19 +50,19 @@ class MinerUParser:
|
||||
from magic_pdf.pipe.UNIPipe import UNIPipe
|
||||
from magic_pdf.libs.MakeContentConfig import DropMode
|
||||
|
||||
# 设置输出目录
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if output_dir is None:
|
||||
output_dir = os.path.dirname(file_path)
|
||||
|
||||
# 创建解析管道
|
||||
# OCR模式可以根据PDF类型选择
|
||||
# auto: 自动判断是否需要OCR
|
||||
# txt: 纯文本PDF(无OCR)
|
||||
# ocr: 扫描件PDF(OCR)
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
pipe = UNIPipe(file_path, output_dir)
|
||||
|
||||
# 执行解析
|
||||
# pipe_mk() 返回Markdown格式文本
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
markdown_content = pipe.pipe_mk()
|
||||
|
||||
logger.success(f"MinerU解析成功")
|
||||
@@ -98,13 +84,13 @@ class MinerUParser:
|
||||
)
|
||||
|
||||
def _extract_metadata(self, pipe) -> Dict[str, str]:
|
||||
"""从解析管道提取元数据"""
|
||||
"""Handle extract metadata for this module for the Miner U Parser instance."""
|
||||
metadata = {}
|
||||
try:
|
||||
# MinerU解析管道中可能包含的元数据信息
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
if hasattr(pipe, 'pdf_mid_data') and pipe.pdf_mid_data:
|
||||
mid_data = pipe.pdf_mid_data
|
||||
# 提取可能的元数据字段
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
metadata = {
|
||||
"page_count": str(mid_data.get("page_count", "")),
|
||||
"language": str(mid_data.get("language", "")),
|
||||
@@ -116,41 +102,27 @@ class MinerUParser:
|
||||
return metadata
|
||||
|
||||
def parse_to_markdown(self, file_path: str) -> str:
    """Parse a document and return only its Markdown text.

    Args:
        file_path: Path passed unchanged to ``self.parse``.

    Returns:
        The ``markdown_text`` of the parse result when its ``success``
        flag is truthy, otherwise an empty string.
    """
    result = self.parse(file_path)
    return result.markdown_text if result.success else ""
|
||||
|
||||
|
||||
class ParserOrchestrator:
|
||||
"""
|
||||
解析服务编排 - 按优先级选择解析器
|
||||
|
||||
解析策略:
|
||||
1. 优先尝试MinerU(版面感知能力强)
|
||||
2. MinerU失败时回退到基础PyMuPDF解析
|
||||
"""
|
||||
"""Represent the Parser Orchestrator type."""
|
||||
|
||||
def __init__(self):
    """Create the MinerU parser and the PDFParser used alongside it.

    Also copies the MinerU parser's ``available`` flag onto the
    orchestrator as ``self.mineru_available``.
    """
    # Function-scope import, kept exactly as in the original code.
    from .pdf_parser import PDFParser

    self.mineru_parser = MinerUParser()
    self.pdf_parser = PDFParser()
    # Snapshot taken at construction time; it is not refreshed afterwards
    # by anything in this method.
    self.mineru_available = self.mineru_parser.available
|
||||
|
||||
def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
|
||||
"""
|
||||
解析PDF文档,按优先级选择解析器
|
||||
|
||||
Args:
|
||||
file_path: PDF文件路径
|
||||
prefer_mineru: 是否优先使用MinerU
|
||||
|
||||
Returns:
|
||||
str: Markdown格式文本
|
||||
"""
|
||||
"""Parse pdf for the Parser Orchestrator instance."""
|
||||
markdown_text = ""
|
||||
|
||||
if prefer_mineru and self.mineru_available:
|
||||
# 优先尝试MinerU
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
result = self.mineru_parser.parse(file_path)
|
||||
if result.success:
|
||||
markdown_text = result.markdown_text
|
||||
@@ -159,28 +131,20 @@ class ParserOrchestrator:
|
||||
else:
|
||||
logger.warning(f"MinerU解析失败,回退到PyMuPDF: {result.error_message}")
|
||||
|
||||
# 回退到PyMuPDF基础解析
|
||||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||||
logger.info("使用PyMuPDF基础解析")
|
||||
markdown_text = self.pdf_parser.parse_to_markdown(file_path)
|
||||
|
||||
return markdown_text
|
||||
|
||||
def parse_docx(self, file_path: str) -> str:
    """Parse a Word document into Markdown text.

    Args:
        file_path: Path passed unchanged to ``DocxParser.parse_to_markdown``.

    Returns:
        Whatever ``DocxParser.parse_to_markdown`` returns for the file
        (annotated as ``str``).
    """
    # Function-scope import, kept exactly as in the original code.
    from .docx_parser import DocxParser

    docx_parser = DocxParser()
    return docx_parser.parse_to_markdown(file_path)
|
||||
|
||||
def parse(self, file_path: str) -> str:
|
||||
"""
|
||||
根据文件类型选择解析器
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
str: Markdown格式文本
|
||||
"""
|
||||
"""Handle parse for the Parser Orchestrator instance."""
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
|
||||
if ext == ".pdf":
|
||||
@@ -192,12 +156,12 @@ class ParserOrchestrator:
|
||||
|
||||
|
||||
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Convenience wrapper: parse a file with a fresh ``MinerUParser``.

    Args:
        file_path: Path passed unchanged to ``MinerUParser.parse``.

    Returns:
        The ``MinerUResult`` produced by ``MinerUParser.parse``.
    """
    parser = MinerUParser()
    return parser.parse(file_path)
|
||||
|
||||
|
||||
def parse_pdf_smart(file_path: str) -> str:
    """Convenience wrapper: parse a PDF through a fresh ``ParserOrchestrator``.

    Args:
        file_path: Path passed unchanged to ``ParserOrchestrator.parse_pdf``;
            ``prefer_mineru`` is left at its default.

    Returns:
        The Markdown text returned by ``ParserOrchestrator.parse_pdf``.
    """
    orchestrator = ParserOrchestrator()
    return orchestrator.parse_pdf(file_path)
|
||||
|
||||
Reference in New Issue
Block a user