Files
TERES_fastapi_backend/deepdoc/parser/ocr_http_client.py

176 lines
6.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
OCR HTTP client.

Calls the HTTP API of a standalone OCR service.
"""
import os
import logging
import requests
from typing import Optional, Union, Dict, Any
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class OCRHttpClient:
    """HTTP client used to call a standalone OCR service.

    All endpoints live under ``{base_url}/api/v1/ocr``. The parse methods
    return the decoded JSON object sent back by the service and raise
    ``ValueError`` when that payload is malformed or reports a failure.
    """

    # Address used when neither the constructor argument nor the
    # OCR_SERVICE_URL environment variable provides a non-empty value.
    DEFAULT_BASE_URL = "http://localhost:8000"

    # Timeout (seconds) for the health probe; kept separate from the
    # (much longer) parse timeout so a dead service is detected quickly.
    HEALTH_CHECK_TIMEOUT = 10

    def __init__(self, base_url: Optional[str] = None, timeout: int = 300):
        """Initialise the OCR HTTP client.

        Args:
            base_url: Base URL of the OCR service. When omitted (or empty),
                the ``OCR_SERVICE_URL`` environment variable is used, and
                if that is unset or empty, ``http://localhost:8000``.
            timeout: Request timeout in seconds for parse calls
                (default 300).
        """
        if not base_url:
            # An empty string can never form a valid URL, so treat it the
            # same as "not provided" instead of building broken endpoints.
            base_url = os.getenv("OCR_SERVICE_URL") or self.DEFAULT_BASE_URL
        # Make sure the URL carries no trailing slash before paths are appended.
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.api_prefix = "/api/v1/ocr"
        logger.info("Initialized OCR HTTP client with base_url: %s", self.base_url)

    @staticmethod
    def _validate_result(result: Any) -> Dict[str, Any]:
        """Return *result* if it is a successful OCR response, else raise.

        Raises:
            ValueError: If *result* is not a JSON object, or its
                ``success`` field is missing or falsy.
        """
        if not isinstance(result, dict):
            # Without this guard a list/null/string body would surface as an
            # AttributeError on ``.get`` rather than the documented ValueError.
            raise ValueError(
                "OCR service returned unexpected response type: "
                f"{type(result).__name__}"
            )
        if not result.get("success", False):
            raise ValueError(f"OCR service returned error: {result.get('message', 'Unknown error')}")
        return result

    def _post(self, url: str, data: Dict[str, Any], files: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """POST form *data* (and optional multipart *files*) and validate the reply.

        Raises:
            requests.RequestException: If the HTTP request fails; the error
                is logged before being re-raised.
            ValueError: If the response payload is malformed or reports failure.
        """
        try:
            response = requests.post(url, files=files, data=data, timeout=self.timeout)
            response.raise_for_status()
            result = response.json()
        except requests.RequestException as e:
            logger.error("Failed to call OCR service: %s", e)
            raise
        return self._validate_result(result)

    def parse_pdf_by_path(self, file_path: str, page_from: int = 1, page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
        """Parse a PDF identified by its file path.

        Args:
            file_path: Path of the PDF file. It is sent to the service
                as-is; the file is not opened by this client.
            page_from: First page to parse (1-based).
            page_to: Last page to parse; 0 means the last page.
            zoomin: Image zoom factor (documented range 1-5).

        Returns:
            dict: The service response, documented as::

                {
                    "success": bool,
                    "message": str,
                    "data": {
                        "pages": [
                            {
                                "page_number": int,
                                "boxes": [
                                    {
                                        "text": str,
                                        "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
                                        "confidence": float
                                    },
                                    ...
                                ]
                            },
                            ...
                        ]
                    }
                }

        Raises:
            requests.RequestException: The HTTP request failed.
            ValueError: The response is malformed or reports a failure.
        """
        url = f"{self.base_url}{self.api_prefix}/parse/path"
        data = {
            "file_path": file_path,
            "page_from": page_from,
            "page_to": page_to,
            "zoomin": zoomin,
        }
        logger.info("Calling OCR service: %s for file: %s", url, file_path)
        return self._post(url, data=data)

    def parse_pdf_by_bytes(self, pdf_bytes: bytes, filename: str = "document.pdf",
                           page_from: int = 1, page_to: int = 0, zoomin: int = 3) -> Dict[str, Any]:
        """Parse a PDF supplied as raw bytes.

        Args:
            pdf_bytes: Binary content of the PDF; uploaded as the multipart
                field ``pdf_bytes`` with content type ``application/pdf``.
            filename: File name sent both as the upload name and as the
                ``filename`` form field (the original note says it is used
                only for logging, presumably on the service side).
            page_from: First page to parse (1-based).
            page_to: Last page to parse; 0 means the last page.
            zoomin: Image zoom factor (documented range 1-5).

        Returns:
            dict: The service response, in the same format as
            ``parse_pdf_by_path``.

        Raises:
            requests.RequestException: The HTTP request failed.
            ValueError: The response is malformed or reports a failure.
        """
        url = f"{self.base_url}{self.api_prefix}/parse/bytes"
        files = {
            "pdf_bytes": (filename, pdf_bytes, "application/pdf"),
        }
        data = {
            "filename": filename,
            "page_from": page_from,
            "page_to": page_to,
            "zoomin": zoomin,
        }
        logger.info("Calling OCR service: %s with %s bytes", url, len(pdf_bytes))
        return self._post(url, data=data, files=files)

    def health_check(self) -> Dict[str, Any]:
        """Query the health endpoint of the OCR service.

        Returns:
            dict: The decoded JSON body of the health response.

        Raises:
            requests.RequestException: The HTTP request failed; the error
                is logged before being re-raised.
        """
        url = f"{self.base_url}{self.api_prefix}/health"
        try:
            response = requests.get(url, timeout=self.HEALTH_CHECK_TIMEOUT)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logger.error("Failed to check OCR service health: %s", e)
            raise