将ocr解析模块独立出来

2025-10-31 14:38:37 +08:00
parent d78f1fe91d
commit 4318179904
13 changed files with 3262 additions and 23 deletions
--- a/deepdoc/parser/ocr_http_client.py
+++ b/deepdoc/parser/ocr_http_client.py
@@ -0,0 +1,175 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+"""
+OCR HTTP 客户端
+用于调用独立的 OCR 服务的 HTTP API
+"""
+
+import os
+import logging
+import requests
+from typing import Optional, Union, Dict, Any
+
+logger = logging.getLogger(__name__)  # Module-level logger named after this module.
+
+
+class OCRHttpClient:
+    """HTTP client for calling a standalone OCR service."""
+
+    def __init__(self, base_url: Optional[str] = None, timeout: int = 300):
+        """
+        Initialize the OCR HTTP client.
+
+        Args:
+            base_url: Base URL of the OCR service. When omitted or empty, the
+                     OCR_SERVICE_URL environment variable is used, falling
+                     back to http://localhost:8000.
+            timeout: Timeout in seconds for parse requests (default 300).
+        """
+        # `or` instead of an `is None` test: an empty argument or an empty
+        # OCR_SERVICE_URL would otherwise produce scheme-less request URLs.
+        base_url = base_url or os.getenv("OCR_SERVICE_URL") or "http://localhost:8000"
+
+        # Strip trailing slashes so appending api_prefix never yields "//".
+        self.base_url = base_url.rstrip("/")
+        self.timeout = timeout
+        self.api_prefix = "/api/v1/ocr"
+
+        logger.info("Initialized OCR HTTP client with base_url: %s", self.base_url)
+
+    def _post_and_validate(self, url: str, **kwargs: Any) -> Dict[str, Any]:
+        """
+        POST to the OCR service and return its validated JSON payload.
+
+        Args:
+            url: Full endpoint URL.
+            **kwargs: Forwarded to requests.post (e.g. data=, files=).
+
+        Returns:
+            dict: Decoded JSON body whose "success" field is truthy.
+
+        Raises:
+            requests.RequestException: The HTTP request failed (logged, then re-raised).
+            ValueError: The body is not a JSON object, or "success" is missing/falsy.
+        """
+        try:
+            response = requests.post(url, timeout=self.timeout, **kwargs)
+            response.raise_for_status()
+            result = response.json()
+        except requests.RequestException as e:
+            logger.error("Failed to call OCR service: %s", e)
+            raise
+
+        # Valid JSON that is not an object (list, null, ...) has no .get();
+        # report it as the documented ValueError rather than AttributeError.
+        if not isinstance(result, dict):
+            raise ValueError(f"OCR service returned unexpected payload type: {type(result).__name__}")
+        if not result.get("success", False):
+            raise ValueError(f"OCR service returned error: {result.get('message', 'Unknown error')}")
+        return result
+
+    def parse_pdf_by_path(self, file_path: str, page_from: int = 1, page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
+        """
+        Parse a PDF identified by a file path.
+
+        Only the path string is sent; this client never opens the file, so the
+        path presumably has to be readable by the OCR service (confirm).
+
+        Args:
+            file_path: Path of the PDF file.
+            page_from: First page to parse (1-based).
+            page_to: Last page to parse (0 means the last page).
+            zoomin: Image zoom factor (1-5).
+
+        Returns:
+            dict: Parse result in the form:
+                {
+                    "success": bool,
+                    "message": str,
+                    "data": {
+                        "pages": [
+                            {
+                                "page_number": int,
+                                "boxes": [
+                                    {
+                                        "text": str,
+                                        "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
+                                        "confidence": float
+                                    },
+                                    ...
+                                ]
+                            },
+                            ...
+                        ]
+                    }
+                }
+
+        Raises:
+            requests.RequestException: The HTTP request failed.
+            ValueError: The response is not a JSON object or reports failure.
+        """
+        url = f"{self.base_url}{self.api_prefix}/parse/path"
+        data = {"file_path": file_path, "page_from": page_from, "page_to": page_to, "zoomin": zoomin}
+        logger.info("Calling OCR service: %s for file: %s", url, file_path)
+        return self._post_and_validate(url, data=data)
+
+    def parse_pdf_by_bytes(self, pdf_bytes: bytes, filename: str = "document.pdf",
+                          page_from: int = 1, page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
+        """
+        Parse a PDF supplied as raw bytes (uploaded as multipart form data).
+
+        Args:
+            pdf_bytes: Binary content of the PDF file.
+            filename: Name sent with the upload and as the "filename" field.
+            page_from: First page to parse (1-based).
+            page_to: Last page to parse (0 means the last page).
+            zoomin: Image zoom factor (1-5).
+
+        Returns:
+            dict: Parse result, same format as parse_pdf_by_path.
+
+        Raises:
+            requests.RequestException: The HTTP request failed.
+            ValueError: The response is not a JSON object or reports failure.
+        """
+        url = f"{self.base_url}{self.api_prefix}/parse/bytes"
+        files = {"pdf_bytes": (filename, pdf_bytes, "application/pdf")}
+        data = {"filename": filename, "page_from": page_from, "page_to": page_to, "zoomin": zoomin}
+        logger.info("Calling OCR service: %s with %d bytes", url, len(pdf_bytes))
+        return self._post_and_validate(url, files=files, data=data)
+
+    def health_check(self, timeout: int = 10) -> Dict[str, Any]:
+        """
+        Check the health of the OCR service.
+
+        Args:
+            timeout: Request timeout in seconds (default 10).
+
+        Returns:
+            dict: Health status information (decoded JSON body).
+
+        Raises:
+            requests.RequestException: The request failed (logged, then re-raised).
+        """
+        url = f"{self.base_url}{self.api_prefix}/health"
+        try:
+            response = requests.get(url, timeout=timeout)
+            response.raise_for_status()
+            return response.json()
+        except requests.RequestException as e:
+            logger.error("Failed to check OCR service health: %s", e)
+            raise
+