#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""
OCR HTTP client.

Calls the HTTP API of a standalone OCR service.
"""

import logging
import os
from typing import Optional, Union, Dict, Any

import requests

logger = logging.getLogger(__name__)


class OCRHttpClient:
    """HTTP client used to call a standalone OCR service."""

    def __init__(self, base_url: Optional[str] = None, timeout: int = 300):
        """
        Initialize the OCR HTTP client.

        Args:
            base_url: Base URL of the OCR service. When None, it is read from
                the OCR_SERVICE_URL environment variable, falling back to
                http://localhost:8000.
            timeout: Timeout in seconds applied to the parse requests.
                Defaults to 300.
        """
        if base_url is None:
            base_url = os.getenv("OCR_SERVICE_URL", "http://localhost:8000")

        # Strip trailing slashes so that joining with api_prefix never
        # produces a double slash.
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.api_prefix = "/api/v1/ocr"

        logger.info(f"Initialized OCR HTTP client with base_url: {self.base_url}")

    def _post(self, url: str, **request_kwargs: Any) -> Dict[str, Any]:
        """
        POST to the OCR service and return the validated JSON payload.

        Args:
            url: Full endpoint URL.
            **request_kwargs: Keyword arguments forwarded unchanged to
                requests.post (e.g. ``data``, ``files``).

        Returns:
            dict: The decoded JSON object whose "success" field is truthy.

        Raises:
            requests.RequestException: The request failed or the service
                answered with an HTTP error status.
            ValueError: The payload is not a JSON object, or the service
                reported a failure ("success" missing or falsy).
        """
        try:
            response = requests.post(url, timeout=self.timeout, **request_kwargs)
            response.raise_for_status()
            result = response.json()
        except requests.RequestException as e:
            logger.error(f"Failed to call OCR service: {e}")
            raise

        # A JSON list/string/null would otherwise fail on .get() with an
        # AttributeError; report it as the documented ValueError instead.
        if not isinstance(result, dict):
            raise ValueError(
                "OCR service returned an unexpected response format: "
                f"expected a JSON object, got {type(result).__name__}"
            )

        if not result.get("success", False):
            raise ValueError(f"OCR service returned error: {result.get('message', 'Unknown error')}")

        return result

    def parse_pdf_by_path(self, file_path: str, page_from: int = 1,
                          page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
        """
        Parse a PDF identified by its file path.

        The path is sent to the service as a form field; the file content is
        not uploaded by this method.

        Args:
            file_path: Local path of the PDF file.
            page_from: First page to parse (1-based).
            page_to: Last page to parse (0 means the last page).
            zoomin: Image zoom factor (documented range 1-5; not validated
                by this client).

        Returns:
            dict: The parse result. Expected format:
                {
                    "success": bool,
                    "message": str,
                    "data": {
                        "pages": [
                            {
                                "page_number": int,
                                "boxes": [
                                    {
                                        "text": str,
                                        "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
                                        "confidence": float
                                    },
                                    ...
                                ]
                            },
                            ...
                        ]
                    }
                }

        Raises:
            requests.RequestException: The HTTP request failed.
            ValueError: The response is malformed or reports a failure.
        """
        url = f"{self.base_url}{self.api_prefix}/parse/path"

        data = {
            "file_path": file_path,
            "page_from": page_from,
            "page_to": page_to,
            "zoomin": zoomin
        }

        logger.info(f"Calling OCR service: {url} for file: {file_path}")
        return self._post(url, data=data)

    def parse_pdf_by_bytes(self, pdf_bytes: bytes, filename: str = "document.pdf",
                           page_from: int = 1, page_to: int = 0,
                           zoomin: int = 3) -> Dict[str, Any]:
        """
        Parse a PDF supplied as binary data.

        The bytes are uploaded as a multipart file part named "pdf_bytes".

        Args:
            pdf_bytes: Binary content of the PDF file.
            filename: File name sent both as the multipart file name and as
                the "filename" form field.
            page_from: First page to parse (1-based).
            page_to: Last page to parse (0 means the last page).
            zoomin: Image zoom factor (documented range 1-5; not validated
                by this client).

        Returns:
            dict: The parse result, in the same format as parse_pdf_by_path.

        Raises:
            requests.RequestException: The HTTP request failed.
            ValueError: The response is malformed or reports a failure.
        """
        url = f"{self.base_url}{self.api_prefix}/parse/bytes"

        files = {
            "pdf_bytes": (filename, pdf_bytes, "application/pdf")
        }

        data = {
            "filename": filename,
            "page_from": page_from,
            "page_to": page_to,
            "zoomin": zoomin
        }

        logger.info(f"Calling OCR service: {url} with {len(pdf_bytes)} bytes")
        return self._post(url, files=files, data=data)

    def health_check(self) -> Dict[str, Any]:
        """
        Check the health status of the OCR service.

        Returns:
            dict: The decoded JSON body of the health endpoint.

        Raises:
            requests.RequestException: The request failed or the service
                answered with an HTTP error status.
        """
        url = f"{self.base_url}{self.api_prefix}/health"

        try:
            # Fixed 10-second timeout, independent of self.timeout.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logger.error(f"Failed to check OCR service health: {e}")
            raise