Fix SSE route dependency and align architecture docs

2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions
--- a/backend/app/services/parser/mineru_parser.py
+++ b/backend/app/services/parser/mineru_parser.py
@@ -1,14 +1,16 @@
-"""MinerU多模态PDF解析 - 版面感知解析"""
+"""MinerU multimodal PDF parsing with layout-aware extraction."""

 from typing import Optional, Dict
 from dataclasses import dataclass, field
 from loguru import logger
 import os
+# magic_pdf (MinerU) is imported lazily inside MinerUParser methods, not at module level.
+


@dataclass
 class MinerUResult:
-    """MinerU解析结果"""
+    """Result of a MinerU parse: source file path, Markdown text, and metadata."""
    file_path: str
    markdown_text: str
    metadata: Dict[str, str] = field(default_factory=dict)
@@ -17,21 +19,14 @@ class MinerUResult:


 class MinerUParser:
-    """
-    MinerU多模态PDF解析器
-
-    MinerU (magic-pdf) 是一个开源的高质量PDF解析工具，
-    支持版面感知解析，能够识别文档中的标题、正文、表格、图片等元素，
-    并输出结构化的Markdown格式。
-
-    GitHub: https://github.com/opendatalab/MinerU
-    """
+    """Layout-aware multimodal PDF parser backed by MinerU (magic-pdf, https://github.com/opendatalab/MinerU)."""

    def __init__(self):
+        """Check MinerU availability once and store the result in ``self.available``."""
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
-        """检查MinerU是否可用"""
+        """Check whether MinerU (the ``magic_pdf`` package) is available."""
        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe
            return True
@@ -40,16 +35,7 @@ class MinerUParser:
            return False

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult:
-        """
-        使用MinerU解析PDF文档
-
-        Args:
-            file_path: PDF文件路径
-            output_dir: 输出目录（可选，用于保存解析产物）
-
-        Returns:
-            MinerUResult: 解析结果
-        """
+        """Parse the PDF at ``file_path`` with MinerU; ``output_dir`` optionally sets where parse artifacts are saved."""
        logger.info(f"尝试使用MinerU解析: {file_path}")

        if not self.available:
@@ -64,19 +50,19 @@ class MinerUParser:
            from magic_pdf.pipe.UNIPipe import UNIPipe
            from magic_pdf.libs.MakeContentConfig import DropMode

-            # 设置输出目录
+            # Set the output directory (defaults to the directory containing the PDF).
            if output_dir is None:
                output_dir = os.path.dirname(file_path)

-            # 创建解析管道
-            # OCR模式可以根据PDF类型选择
-            # auto: 自动判断是否需要OCR
-            # txt: 纯文本PDF（无OCR）
-            # ocr: 扫描件PDF（OCR）
+            # Create the parsing pipeline.
+            # The OCR mode can be chosen according to the PDF type:
+            # auto: detect automatically whether OCR is needed
+            # txt: text-based PDF (no OCR)
+            # ocr: scanned PDF (OCR)
            pipe = UNIPipe(file_path, output_dir)

-            # 执行解析
-            # pipe_mk() 返回Markdown格式文本
+            # Run the parse.
+            # pipe_mk() returns the text in Markdown format.
            markdown_content = pipe.pipe_mk()

            logger.success(f"MinerU解析成功")
@@ -98,13 +84,13 @@ class MinerUParser:
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
-        """从解析管道提取元数据"""
+        """Extract metadata from the parsing pipeline."""
        metadata = {}
        try:
-            # MinerU解析管道中可能包含的元数据信息
+            # Metadata that the MinerU parsing pipeline may carry.
            if hasattr(pipe, 'pdf_mid_data') and pipe.pdf_mid_data:
                mid_data = pipe.pdf_mid_data
-                # 提取可能的元数据字段
+                # Extract the metadata fields that may be present.
                metadata = {
                    "page_count": str(mid_data.get("page_count", "")),
                    "language": str(mid_data.get("language", "")),
@@ -116,41 +102,27 @@ class MinerUParser:
        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
-        """直接解析并返回Markdown文本"""
+        """Parse the file and return its Markdown text, or an empty string if parsing failed."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""


 class ParserOrchestrator:
-    """
-    解析服务编排 - 按优先级选择解析器
-
-    解析策略：
-    1. 优先尝试MinerU（版面感知能力强）
-    2. MinerU失败时回退到基础PyMuPDF解析
-    """
+    """Parsing orchestrator: tries MinerU first (strong layout awareness) and falls back to basic PyMuPDF parsing."""

    def __init__(self):
+        """Create the MinerU and fallback PDF parsers and record whether MinerU is available."""
        from .pdf_parser import PDFParser
        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
-        """
-        解析PDF文档，按优先级选择解析器
-
-        Args:
-            file_path: PDF文件路径
-            prefer_mineru: 是否优先使用MinerU
-
-        Returns:
-            str: Markdown格式文本
-        """
+        """Parse a PDF to Markdown text, choosing the parser by priority; ``prefer_mineru`` tries MinerU first."""
        markdown_text = ""

        if prefer_mineru and self.mineru_available:
-            # 优先尝试MinerU
+            # Try MinerU first.
            result = self.mineru_parser.parse(file_path)
            if result.success:
                markdown_text = result.markdown_text
@@ -159,28 +131,20 @@ class ParserOrchestrator:
            else:
                logger.warning(f"MinerU解析失败，回退到PyMuPDF: {result.error_message}")

-        # 回退到PyMuPDF基础解析
+        # Fall back to basic PyMuPDF parsing.
        logger.info("使用PyMuPDF基础解析")
        markdown_text = self.pdf_parser.parse_to_markdown(file_path)

        return markdown_text

    def parse_docx(self, file_path: str) -> str:
-        """解析Word文档"""
+        """Parse a Word document and return its Markdown text."""
        from .docx_parser import DocxParser
        docx_parser = DocxParser()
        return docx_parser.parse_to_markdown(file_path)

    def parse(self, file_path: str) -> str:
-        """
-        根据文件类型选择解析器
-
-        Args:
-            file_path: 文件路径
-
-        Returns:
-            str: Markdown格式文本
-        """
+        """Choose a parser based on the file extension and return Markdown text."""
        ext = os.path.splitext(file_path)[1].lower()

        if ext == ".pdf":
@@ -192,12 +156,12 @@ class ParserOrchestrator:


 def parse_with_mineru(file_path: str) -> MinerUResult:
-    """便捷函数：使用MinerU解析"""
+    """Convenience function: parse a file with MinerU."""
    parser = MinerUParser()
    return parser.parse(file_path)


 def parse_pdf_smart(file_path: str) -> str:
-    """便捷函数：智能解析PDF（自动选择最佳解析器）"""
+    """Convenience function: smart PDF parsing that automatically picks the best available parser."""
    orchestrator = ParserOrchestrator()
    return orchestrator.parse_pdf(file_path)