176 lines
6.0 KiB
Python
176 lines
6.0 KiB
Python
|
|
#
|
|||
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|||
|
|
#
|
|||
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|||
|
|
# you may not use this file except in compliance with the License.
|
|||
|
|
# You may obtain a copy of the License at
|
|||
|
|
#
|
|||
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|||
|
|
#
|
|||
|
|
# Unless required by applicable law or agreed to in writing, software
|
|||
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|||
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
|
|
# See the License for the specific language governing permissions and
|
|||
|
|
# limitations under the License.
|
|||
|
|
#
|
|||
|
|
"""
OCR HTTP client.

Calls the HTTP API of a standalone OCR service.
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import logging
|
|||
|
|
import requests
|
|||
|
|
from typing import Optional, Union, Dict, Any
|
|||
|
|
|
|||
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class OCRHttpClient:
    """HTTP client for a standalone OCR service.

    Every request targets ``{base_url}/api/v1/ocr/<endpoint>``. The parse
    methods return the decoded JSON envelope of the service and raise
    ``ValueError`` when that envelope is malformed or reports a failure.
    """

    def __init__(self, base_url: Optional[str] = None, timeout: int = 300,
                 session: Optional[Any] = None):
        """Create a client.

        Args:
            base_url: Base URL of the OCR service. When ``None`` it is read
                from the ``OCR_SERVICE_URL`` environment variable, falling
                back to ``http://localhost:8000``.
            timeout: Timeout in seconds applied to the parse requests.
            session: Optional object exposing ``get``/``post`` with the same
                signature as the ``requests`` module functions (typically a
                ``requests.Session``). When ``None`` the module-level
                ``requests`` functions are used, as before.
        """
        if base_url is None:
            base_url = os.getenv("OCR_SERVICE_URL", "http://localhost:8000")

        # Drop trailing slashes so joining with api_prefix never yields "//".
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.api_prefix = "/api/v1/ocr"
        # Kept as given (possibly None); the fallback to ``requests`` is
        # resolved at call time in _send().
        self._session = session

        logger.info(f"Initialized OCR HTTP client with base_url: {self.base_url}")

    def _send(self, method: str, url: str, *, timeout: Union[int, float],
              failure_log: str, **kwargs: Any) -> Any:
        """Issue one HTTP request and return the decoded JSON body.

        Transport/HTTP-status errors and JSON decoding errors are logged with
        ``failure_log`` as prefix and then re-raised unchanged.
        """
        transport = self._session if self._session is not None else requests

        # Only the network call and the status check belong in this try:
        # nothing else here is expected to raise RequestException.
        try:
            response = getattr(transport, method)(url, timeout=timeout, **kwargs)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"{failure_log}: {e}")
            raise

        # A body that is not valid JSON surfaces as a ValueError subclass.
        try:
            return response.json()
        except ValueError as e:
            logger.error(f"{failure_log}: {e}")
            raise

    def _parse(self, url: str, data: Dict[str, Any],
               files: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """POST a parse request and validate the response envelope."""
        # Only pass ``files`` when there is an upload, so the path-based call
        # keeps exactly the keyword arguments it always had.
        extra = {} if files is None else {"files": files}
        result = self._send(
            "post",
            url,
            timeout=self.timeout,
            failure_log="Failed to call OCR service",
            data=data,
            **extra,
        )

        # Valid JSON that is not an object (list, string, null, ...) has no
        # ``.get``; report it as a malformed response instead of crashing
        # with AttributeError.
        if not isinstance(result, dict):
            raise ValueError(
                f"OCR service returned an unexpected response type: {type(result).__name__}"
            )
        if not result.get("success", False):
            raise ValueError(f"OCR service returned error: {result.get('message', 'Unknown error')}")

        return result

    def parse_pdf_by_path(self, file_path: str, page_from: int = 1, page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
        """Parse a PDF that the service reads from a file path.

        The arguments are forwarded as form fields to ``/parse/path``; no
        validation is done on the client side.

        Args:
            file_path: Path of the PDF file.
            page_from: First page to parse (documented by the service as
                1-based).
            page_to: Last page to parse (documented by the service as
                ``0`` meaning "through the last page").
            zoomin: Image zoom factor (documented by the service as 1-5).

        Returns:
            The decoded response envelope. Its documented shape is::

                {
                    "success": bool,
                    "message": str,
                    "data": {
                        "pages": [
                            {
                                "page_number": int,
                                "boxes": [
                                    {
                                        "text": str,
                                        "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
                                        "confidence": float
                                    },
                                    ...
                                ]
                            },
                            ...
                        ]
                    }
                }

        Raises:
            requests.RequestException: The HTTP request failed or returned an
                error status.
            ValueError: The response is not valid JSON, is not a JSON object,
                or does not report ``"success": true``.
        """
        url = f"{self.base_url}{self.api_prefix}/parse/path"

        data = {
            "file_path": file_path,
            "page_from": page_from,
            "page_to": page_to,
            "zoomin": zoomin,
        }

        logger.info(f"Calling OCR service: {url} for file: {file_path}")
        return self._parse(url, data)

    def parse_pdf_by_bytes(self, pdf_bytes: bytes, filename: str = "document.pdf",
                           page_from: int = 1, page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
        """Parse a PDF uploaded as raw bytes.

        The bytes are sent as the multipart part ``pdf_bytes`` to
        ``/parse/bytes``; the remaining arguments go along as form fields.

        Args:
            pdf_bytes: Content of the PDF file.
            filename: Name sent as the multipart file name and as the
                ``filename`` form field.
            page_from: First page to parse (documented by the service as
                1-based).
            page_to: Last page to parse (documented by the service as
                ``0`` meaning "through the last page").
            zoomin: Image zoom factor (documented by the service as 1-5).

        Returns:
            The decoded response envelope, same shape as returned by
            ``parse_pdf_by_path``.

        Raises:
            requests.RequestException: The HTTP request failed or returned an
                error status.
            ValueError: The response is not valid JSON, is not a JSON object,
                or does not report ``"success": true``.
        """
        url = f"{self.base_url}{self.api_prefix}/parse/bytes"

        files = {
            "pdf_bytes": (filename, pdf_bytes, "application/pdf"),
        }

        data = {
            "filename": filename,
            "page_from": page_from,
            "page_to": page_to,
            "zoomin": zoomin,
        }

        logger.info(f"Calling OCR service: {url} with {len(pdf_bytes)} bytes")
        return self._parse(url, data, files=files)

    def health_check(self, timeout: Union[int, float] = 10) -> Dict[str, Any]:
        """Query the health endpoint of the OCR service.

        Args:
            timeout: Timeout in seconds for this request. Defaults to 10 and
                is independent of the client-wide ``timeout``.

        Returns:
            The decoded JSON body of the health endpoint, returned as-is
            (no ``success`` check is applied).

        Raises:
            requests.RequestException: The HTTP request failed or returned an
                error status.
            ValueError: The response body is not valid JSON.
        """
        url = f"{self.base_url}{self.api_prefix}/health"
        return self._send(
            "get",
            url,
            timeout=timeout,
            failure_log="Failed to check OCR service health",
        )
|
|||
|
|
|