AIRegulation-DocAnalysis/backend/app/services/parser/mineru_parser.py

"""MinerU (magic-pdf) document parsing, plus an orchestrator that falls back to PyMuPDF for PDFs."""

from typing import Optional, Dict
from dataclasses import dataclass, field
from loguru import logger
import os
# magic-pdf and the sibling parser modules are imported lazily inside the
# classes below rather than at module level.


@dataclass
class MinerUResult:
    """Outcome of one MinerU parse attempt, successful or not.

    Instances are built by ``MinerUParser.parse``, which reports failures
    through ``success`` / ``error_message`` instead of raising.
    """
    # Path of the input document, echoed back exactly as it was passed in.
    file_path: str
    # Markdown returned by MinerU; the empty string when parsing failed.
    markdown_text: str
    # Document facts gathered by ``MinerUParser._extract_metadata``; every
    # value is stringified. Defaults to a fresh empty dict per instance.
    metadata: Dict[str, str] = field(default_factory=dict)
    # False when MinerU is not installed or the parse raised an exception.
    success: bool = True
    # Reason for the failure; left empty on success.
    error_message: str = ""


class MinerUParser:
    """Convert documents to Markdown through MinerU (the ``magic-pdf`` package).

    ``magic-pdf`` is optional. Its presence is probed once at construction and
    stored in ``self.available``; when it is missing, ``parse`` returns a
    failed ``MinerUResult`` instead of raising.
    """

    def __init__(self):
        """Probe for MinerU and remember the outcome in ``self.available``."""
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return ``True`` when ``magic_pdf.pipe.UNIPipe`` can be imported.

        On ``ImportError`` a warning with the install command is logged and
        ``False`` is returned.
        """
        try:
            # Imported only to prove the dependency is present.
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装，请运行: pip install magic-pdf[full]")
            return False
        return True

    @staticmethod
    def _resolve_output_dir(file_path: str, output_dir: Optional[str]) -> str:
        """Pick the directory handed to MinerU as its output location.

        An explicit ``output_dir`` is returned untouched. Otherwise the
        directory part of ``file_path`` is used; a bare file name has an empty
        directory part, so ``"."`` is returned in that case.
        """
        if output_dir is not None:
            return output_dir
        return os.path.dirname(file_path) or "."

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> "MinerUResult":
        """Parse ``file_path`` with MinerU and wrap the outcome in a result.

        Args:
            file_path: Path of the document to parse.
            output_dir: Directory passed to MinerU. Defaults to the directory
                containing ``file_path`` (``"."`` for a bare file name).

        Returns:
            A ``MinerUResult``. ``success`` is ``False`` with an
            ``error_message`` when MinerU is not installed or when anything
            raises during parsing; this method itself does not raise for
            those cases.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")

        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )

        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            # NOTE(review): the constructor arguments and the pipe_mk() call
            # are kept exactly as this file had them; confirm both against the
            # installed magic-pdf version.
            pipe = UNIPipe(file_path, self._resolve_output_dir(file_path, output_dir))
            markdown_content = pipe.pipe_mk()

            logger.success("MinerU解析成功")

            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )
        except Exception as e:
            # Deliberate catch-all: any MinerU failure is reported through the
            # result object so callers can fall back to another parser.
            logger.error(f"MinerU解析失败: {e}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(e),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """Collect stringified metadata from ``pipe.pdf_mid_data``.

        Best effort: returns ``{}`` when the attribute is missing or falsy,
        and logs a warning (still returning ``{}``) if reading it raises.
        Keys absent from the intermediate data map to the empty string.
        """
        metadata = {}
        try:
            # Read the attribute once; a missing attribute behaves like None.
            mid_data = getattr(pipe, "pdf_mid_data", None)
            if mid_data:
                metadata = {
                    key: str(mid_data.get(key, ""))
                    for key in ("page_count", "language", "is_scanned")
                }
        except Exception as e:
            logger.warning(f"提取MinerU元数据失败: {e}")

        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
        """Return the Markdown for ``file_path``, or ``""`` if parsing failed."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""


class ParserOrchestrator:
    """Dispatch a document to a parser, preferring MinerU for PDFs.

    Holds one ``MinerUParser`` and one ``PDFParser``; the latter is the
    fallback used when MinerU is unavailable, not preferred, or fails.
    """

    def __init__(self):
        """Create both PDF parsers and cache whether MinerU is usable."""
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """Return Markdown for a PDF, trying MinerU first when allowed.

        MinerU is attempted only if ``prefer_mineru`` is true and MinerU was
        available at construction. A failed MinerU attempt is logged and the
        PyMuPDF-based ``PDFParser`` result is returned instead.
        """
        if prefer_mineru and self.mineru_available:
            outcome = self.mineru_parser.parse(file_path)
            if outcome.success:
                converted = outcome.markdown_text
                logger.info("使用MinerU解析成功")
                return converted
            logger.warning(f"MinerU解析失败，回退到PyMuPDF: {outcome.error_message}")

        # Reached when MinerU was skipped or did not succeed.
        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Return Markdown for a Word document via a fresh ``DocxParser``."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
        """Parse ``file_path`` according to its (case-insensitive) extension.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
        """
        _, suffix = os.path.splitext(file_path)
        suffix = suffix.lower()

        if suffix == ".pdf":
            return self.parse_pdf(file_path)
        if suffix in (".docx", ".doc"):
            return self.parse_docx(file_path)
        raise ValueError(f"不支持的文件类型: {suffix}")


def parse_with_mineru(file_path: str) -> MinerUResult:
    """Parse ``file_path`` with a throwaway ``MinerUParser`` and return its result."""
    return MinerUParser().parse(file_path)


def parse_pdf_smart(file_path: str) -> str:
    """Return Markdown for a PDF using a throwaway ``ParserOrchestrator``.

    Delegates to ``ParserOrchestrator.parse_pdf`` with its default arguments.
    """
    return ParserOrchestrator().parse_pdf(file_path)