168 lines
6.3 KiB
Python
168 lines
6.3 KiB
Python
"""Provide service-layer logic for mineru parser."""
|
||
|
||
from typing import Optional, Dict
|
||
from dataclasses import dataclass, field
|
||
from loguru import logger
|
||
import os
|
||
# Keep service responsibilities explicit so downstream behavior stays predictable.
|
||
|
||
|
||
|
||
@dataclass
class MinerUResult:
    """Outcome of a single MinerU parse attempt.

    Attributes:
        file_path: Path of the document that was handed to the parser.
        markdown_text: Markdown produced by the parser; empty on failure.
        metadata: String-valued metadata collected from the parse pipeline.
        success: Whether the parse completed without error.
        error_message: Failure description; empty when ``success`` is True.
    """

    # Required fields come first so positional construction keeps working.
    file_path: str
    markdown_text: str
    # default_factory gives every instance its own dict.
    metadata: Dict[str, str] = field(default_factory=dict)
    success: bool = True
    error_message: str = ""
|
||
|
||
|
||
class MinerUParser:
    """Parse documents to Markdown through MinerU (the ``magic-pdf`` package).

    Whether ``magic-pdf`` can be imported is probed once at construction and
    cached in ``self.available``. ``parse`` reports failures through the
    returned ``MinerUResult`` instead of raising.
    """

    def __init__(self):
        """Probe for the optional ``magic-pdf`` dependency and cache the result."""
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return True when ``magic_pdf.pipe.UNIPipe`` can be imported.

        On ImportError a warning with the install hint is logged and False is
        returned.
        """
        try:
            # Imported purely as an availability probe; the name is not used here.
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装,请运行: pip install magic-pdf[full]")
            return False
        return True

    @staticmethod
    def _resolve_output_dir(file_path: str, output_dir: Optional[str] = None) -> str:
        """Return the directory that is passed to MinerU as the output location.

        An explicitly supplied ``output_dir`` (anything other than None) is
        returned untouched. Otherwise the directory part of ``file_path`` is
        used; a bare file name has an empty directory part, so "." is returned
        in that case instead of an empty string.
        """
        if output_dir is not None:
            return output_dir
        return os.path.dirname(file_path) or "."

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> "MinerUResult":
        """Parse ``file_path`` with MinerU and wrap the outcome in a result object.

        Args:
            file_path: Path of the document to parse.
            output_dir: Directory handed to the MinerU pipeline. When None, the
                directory containing ``file_path`` is used ("." for a bare
                file name).

        Returns:
            A ``MinerUResult``. ``success`` is False (with ``error_message``
            set and empty ``markdown_text``) when MinerU is not installed or
            any exception is raised while parsing.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")

        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )

        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            output_dir = self._resolve_output_dir(file_path, output_dir)

            # NOTE(review): the constructor arguments and ``pipe_mk`` are kept
            # exactly as found; confirm them against the UNIPipe API of the
            # installed magic-pdf version.
            pipe = UNIPipe(file_path, output_dir)
            markdown_content = pipe.pipe_mk()

            logger.success("MinerU解析成功")

            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )
        except Exception as e:
            # Deliberate catch-all boundary: callers rely on a failed result
            # (not an exception) to decide whether to fall back.
            logger.error(f"MinerU解析失败: {e}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(e),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """Collect selected fields from ``pipe.pdf_mid_data`` as strings.

        Returns a dict with the keys ``page_count``, ``language`` and
        ``is_scanned`` (missing entries become ""), or an empty dict when the
        pipe has no truthy ``pdf_mid_data`` or reading it fails. Failures are
        logged as warnings and never propagated.
        """
        metadata: Dict[str, str] = {}
        try:
            mid_data = getattr(pipe, "pdf_mid_data", None)
            if mid_data:
                metadata = {
                    key: str(mid_data.get(key, ""))
                    for key in ("page_count", "language", "is_scanned")
                }
        except Exception as e:
            # Metadata is best-effort; a failure here must not fail the parse.
            logger.warning(f"提取MinerU元数据失败: {e}")

        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
        """Return the Markdown for ``file_path``, or "" when parsing fails."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""
|
||
|
||
|
||
class ParserOrchestrator:
    """Route a document to a parser based on its type and parser availability.

    PDFs go to MinerU first when it is preferred and available, and otherwise
    (or when MinerU reports failure) to the basic ``PDFParser``. Word
    documents go to ``DocxParser``.
    """

    def __init__(self):
        """Create the MinerU and basic PDF parsers and record MinerU availability."""
        # Imported here rather than at module level, as in the original layout.
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """Return Markdown for a PDF, trying MinerU before the basic parser.

        Args:
            file_path: Path of the PDF to parse.
            prefer_mineru: When True and MinerU is available, try it first.

        Returns:
            The Markdown text from MinerU when it succeeds, otherwise the
            output of ``self.pdf_parser.parse_to_markdown``.
        """
        if prefer_mineru and self.mineru_available:
            outcome = self.mineru_parser.parse(file_path)
            if outcome.success:
                text = outcome.markdown_text
                logger.info("使用MinerU解析成功")
                return text
            # MinerU failed: record why, then fall through to the basic parser.
            logger.warning(f"MinerU解析失败,回退到PyMuPDF: {outcome.error_message}")

        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Return Markdown for a Word document using a fresh ``DocxParser``."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
        """Dispatch on the lower-cased file extension and return Markdown.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
        """
        _, raw_suffix = os.path.splitext(file_path)
        suffix = raw_suffix.lower()

        if suffix == ".pdf":
            return self.parse_pdf(file_path)
        if suffix in (".docx", ".doc"):
            return self.parse_docx(file_path)
        raise ValueError(f"不支持的文件类型: {suffix}")
|
||
|
||
|
||
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Parse ``file_path`` with a freshly constructed ``MinerUParser``.

    Returns the ``MinerUResult`` produced by ``MinerUParser.parse``.
    """
    return MinerUParser().parse(file_path)
|
||
|
||
|
||
def parse_pdf_smart(file_path: str) -> str:
    """Parse a PDF to Markdown via a freshly constructed ``ParserOrchestrator``.

    Delegates to ``ParserOrchestrator.parse_pdf`` with its default arguments.
    """
    return ParserOrchestrator().parse_pdf(file_path)
|