Files
AIRegulation-DocAnalysis/backend/app/services/parser/mineru_parser.py

168 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Provide service-layer logic for mineru parser."""
from typing import Optional, Dict
from dataclasses import dataclass, field
from loguru import logger
import os
# Keep service responsibilities explicit so downstream behavior stays predictable.
@dataclass
class MinerUResult:
    """Outcome of one MinerU parsing attempt.

    Attributes:
        file_path: Path of the document that was handed to the parser.
        markdown_text: Markdown text produced for the document.
        metadata: String-valued metadata about the document; a fresh empty
            dict is created per instance when none is supplied.
        success: Whether the attempt succeeded (defaults to True).
        error_message: Description of the failure (defaults to "").
    """

    file_path: str
    markdown_text: str
    metadata: Dict[str, str] = field(default_factory=dict)
    success: bool = True
    error_message: str = ""
class MinerUParser:
    """Wrapper around MinerU (magic-pdf) that converts a document to Markdown.

    magic-pdf is treated as an optional dependency: it is probed once when
    the parser is constructed and the outcome is stored in ``self.available``.
    When it is missing, ``parse`` returns a failed ``MinerUResult`` instead of
    raising.
    """

    def __init__(self):
        """Probe for magic-pdf and remember whether it can be imported."""
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return True when ``magic_pdf.pipe.UNIPipe`` can be imported.

        On ImportError a warning containing the install hint is logged and
        False is returned.
        """
        try:
            # Imported only to prove the package is installed.
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装,请运行: pip install magic-pdf[full]")
            return False
        return True

    @staticmethod
    def _resolve_output_dir(file_path: str, output_dir: Optional[str]) -> str:
        """Return the directory that is handed to the MinerU pipeline.

        An explicit ``output_dir`` is returned unchanged. Otherwise the
        directory part of ``file_path`` is used; a bare filename has an empty
        directory part, which is mapped to ``"."`` so the pipeline never
        receives an empty path.
        """
        if output_dir is not None:
            return output_dir
        return os.path.dirname(file_path) or "."

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult:
        """Parse ``file_path`` with MinerU and report the outcome.

        Args:
            file_path: Path of the document to parse.
            output_dir: Directory passed to the MinerU pipeline. When omitted,
                the directory containing ``file_path`` is used.

        Returns:
            A ``MinerUResult``. ``success`` is False and ``error_message`` is
            set when magic-pdf is not installed or when the pipeline raises;
            pipeline exceptions are logged and reported, not propagated.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")
        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )

        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            target_dir = self._resolve_output_dir(file_path, output_dir)
            # NOTE(review): confirm this constructor signature and the
            # ``pipe_mk()`` method against the installed magic-pdf release —
            # TODO verify; any mismatch surfaces as a failed result below.
            pipe = UNIPipe(file_path, target_dir)
            markdown_content = pipe.pipe_mk()
            logger.success("MinerU解析成功")
            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )
        except Exception as e:
            # Best-effort boundary: every failure becomes a failed result so
            # the caller can decide what to do next.
            logger.error(f"MinerU解析失败: {e}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(e),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """Read page count, language and scanned flag from ``pipe.pdf_mid_data``.

        Every value is converted with ``str``; a missing key yields ``""``.
        Returns an empty dict when ``pipe`` has no ``pdf_mid_data`` attribute,
        when that attribute is falsy, or when reading it raises (the error is
        logged as a warning).
        """
        try:
            mid_data = getattr(pipe, "pdf_mid_data", None)
            if not mid_data:
                return {}
            return {
                key: str(mid_data.get(key, ""))
                for key in ("page_count", "language", "is_scanned")
            }
        except Exception as e:
            logger.warning(f"提取MinerU元数据失败: {e}")
            return {}

    def parse_to_markdown(self, file_path: str) -> str:
        """Return the Markdown for ``file_path``, or ``""`` when parsing fails."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""
class ParserOrchestrator:
    """Pick a parser for a document based on its type.

    PDFs go to MinerU first when it is preferred and available, with the
    basic PDF parser as fallback; ``.docx`` / ``.doc`` files go to the DOCX
    parser.
    """

    def __init__(self):
        """Build the MinerU and basic PDF parsers and cache MinerU availability."""
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """Return Markdown for a PDF.

        Args:
            file_path: Path of the PDF to parse.
            prefer_mineru: Try MinerU first when it is available.

        Returns:
            The MinerU Markdown when MinerU was tried and succeeded, otherwise
            the output of the basic PDF parser.
        """
        if prefer_mineru and self.mineru_available:
            outcome = self.mineru_parser.parse(file_path)
            if outcome.success:
                text = outcome.markdown_text
                logger.info("使用MinerU解析成功")
                return text
            logger.warning(f"MinerU解析失败回退到PyMuPDF: {outcome.error_message}")

        # Reached when MinerU is not preferred, not available, or failed.
        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Return Markdown for a Word document using a new ``DocxParser``."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
        """Dispatch ``file_path`` to a parser by its lower-cased extension.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
        """
        _, suffix = os.path.splitext(file_path)
        suffix = suffix.lower()
        if suffix == ".pdf":
            return self.parse_pdf(file_path)
        if suffix in (".docx", ".doc"):
            return self.parse_docx(file_path)
        raise ValueError(f"不支持的文件类型: {suffix}")
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Parse ``file_path`` with a newly constructed ``MinerUParser``.

    Returns the ``MinerUResult`` produced by ``MinerUParser.parse``.
    """
    return MinerUParser().parse(file_path)
def parse_pdf_smart(file_path: str) -> str:
    """Return Markdown for a PDF via a newly constructed ``ParserOrchestrator``.

    Delegates to ``ParserOrchestrator.parse_pdf`` with its default arguments.
    """
    return ParserOrchestrator().parse_pdf(file_path)