"""Provide service-layer logic for mineru parser.""" from typing import Optional, Dict from dataclasses import dataclass, field from loguru import logger import os # Keep service responsibilities explicit so downstream behavior stays predictable. @dataclass class MinerUResult: """Represent the Miner U Result type.""" file_path: str markdown_text: str metadata: Dict[str, str] = field(default_factory=dict) success: bool = True error_message: str = "" class MinerUParser: """Provide the Miner U Parser parser.""" def __init__(self): """Initialize the Miner U Parser instance.""" self.available = self._check_mineru_available() def _check_mineru_available(self) -> bool: """Handle check mineru available for this module for the Miner U Parser instance.""" try: from magic_pdf.pipe.UNIPipe import UNIPipe return True except ImportError: logger.warning("MinerU (magic-pdf) 未安装,请运行: pip install magic-pdf[full]") return False def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult: """Handle parse for the Miner U Parser instance.""" logger.info(f"尝试使用MinerU解析: {file_path}") if not self.available: return MinerUResult( file_path=file_path, markdown_text="", success=False, error_message="MinerU未安装" ) try: from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.libs.MakeContentConfig import DropMode # Keep service responsibilities explicit so downstream behavior stays predictable. if output_dir is None: output_dir = os.path.dirname(file_path) # Keep service responsibilities explicit so downstream behavior stays predictable. # Keep service responsibilities explicit so downstream behavior stays predictable. # Keep service responsibilities explicit so downstream behavior stays predictable. # Keep service responsibilities explicit so downstream behavior stays predictable. # Keep service responsibilities explicit so downstream behavior stays predictable. pipe = UNIPipe(file_path, output_dir) # Keep service responsibilities explicit so downstream behavior stays predictable. # Keep service responsibilities explicit so downstream behavior stays predictable. markdown_content = pipe.pipe_mk() logger.success(f"MinerU解析成功") return MinerUResult( file_path=file_path, markdown_text=markdown_content, metadata=self._extract_metadata(pipe), success=True ) except Exception as e: logger.error(f"MinerU解析失败: {e}") return MinerUResult( file_path=file_path, markdown_text="", success=False, error_message=str(e) ) def _extract_metadata(self, pipe) -> Dict[str, str]: """Handle extract metadata for this module for the Miner U Parser instance.""" metadata = {} try: # Keep service responsibilities explicit so downstream behavior stays predictable. if hasattr(pipe, 'pdf_mid_data') and pipe.pdf_mid_data: mid_data = pipe.pdf_mid_data # Keep service responsibilities explicit so downstream behavior stays predictable. metadata = { "page_count": str(mid_data.get("page_count", "")), "language": str(mid_data.get("language", "")), "is_scanned": str(mid_data.get("is_scanned", "")), } except Exception as e: logger.warning(f"提取MinerU元数据失败: {e}") return metadata def parse_to_markdown(self, file_path: str) -> str: """Parse to markdown for the Miner U Parser instance.""" result = self.parse(file_path) return result.markdown_text if result.success else "" class ParserOrchestrator: """Represent the Parser Orchestrator type.""" def __init__(self): """Initialize the Parser Orchestrator instance.""" from .pdf_parser import PDFParser self.mineru_parser = MinerUParser() self.pdf_parser = PDFParser() self.mineru_available = self.mineru_parser.available def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str: """Parse pdf for the Parser Orchestrator instance.""" markdown_text = "" if prefer_mineru and self.mineru_available: # Keep service responsibilities explicit so downstream behavior stays predictable. result = self.mineru_parser.parse(file_path) if result.success: markdown_text = result.markdown_text logger.info("使用MinerU解析成功") return markdown_text else: logger.warning(f"MinerU解析失败,回退到PyMuPDF: {result.error_message}") # Keep service responsibilities explicit so downstream behavior stays predictable. logger.info("使用PyMuPDF基础解析") markdown_text = self.pdf_parser.parse_to_markdown(file_path) return markdown_text def parse_docx(self, file_path: str) -> str: """Parse docx for the Parser Orchestrator instance.""" from .docx_parser import DocxParser docx_parser = DocxParser() return docx_parser.parse_to_markdown(file_path) def parse(self, file_path: str) -> str: """Handle parse for the Parser Orchestrator instance.""" ext = os.path.splitext(file_path)[1].lower() if ext == ".pdf": return self.parse_pdf(file_path) elif ext in [".docx", ".doc"]: return self.parse_docx(file_path) else: raise ValueError(f"不支持的文件类型: {ext}") def parse_with_mineru(file_path: str) -> MinerUResult: """Parse with mineru.""" parser = MinerUParser() return parser.parse(file_path) def parse_pdf_smart(file_path: str) -> str: """Parse pdf smart.""" orchestrator = ParserOrchestrator() return orchestrator.parse_pdf(file_path)