# TERES_fastapi_backend/ocr/api.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
"""
OCR PDF处理的FastAPI路由
提供HTTP接口用于PDF的OCR识别
"""

import asyncio
import logging
import os
import sys
import tempfile
from pathlib import Path
from typing import Optional

from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel

# Import handling: support both running this file directly as a script and
# importing it as part of a package.

try:
    _package = __package__
except NameError:
    # Defensive: fall back to the "run directly" path if __package__ is not
    # defined in the executing namespace.
    _package = None

if _package is None:
    # Run directly: put the parent directory on sys.path so the
    # package-qualified absolute imports below can resolve.
    parent_dir = Path(__file__).parent.parent
    if str(parent_dir) not in sys.path:
        sys.path.insert(0, str(parent_dir))
    from ocr.pdf_parser import SimplePdfParser
    from ocr.config import MODEL_DIR
else:
    # Imported as a module: prefer explicit relative imports, which resolve
    # against the enclosing package regardless of what is on sys.path.
    try:
        from .pdf_parser import SimplePdfParser
        from .config import MODEL_DIR
    except ImportError:
        # Fallback for layouts where this file is imported as a top-level
        # module (empty __package__) or where the sibling modules are only
        # reachable by their bare names through sys.path.
        from pdf_parser import SimplePdfParser
        from config import MODEL_DIR

logger = logging.getLogger(__name__)

# Router for the OCR endpoints; every route below is served under /api/v1/ocr.
router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])

# Module-wide parser instance (lazily created; starts out as None).
_parser_instance: Optional[SimplePdfParser] = None


def get_parser() -> SimplePdfParser:
    """Return the module-wide parser, constructing it on the first call.

    The instance is cached in the module-level ``_parser_instance`` so later
    calls reuse the same object instead of building a new parser.

    Returns:
        SimplePdfParser: The shared parser built with ``model_dir=MODEL_DIR``.
    """
    global _parser_instance
    # Fast path: the parser has already been built by an earlier call.
    if _parser_instance is not None:
        return _parser_instance

    logger.info(f"Initializing OCR parser with model_dir={MODEL_DIR}")
    _parser_instance = SimplePdfParser(model_dir=MODEL_DIR)
    return _parser_instance


class ParseResponse(BaseModel):
    """Response model used by the PDF parsing endpoints in this module."""
    # Success flag; the endpoints in this file set it to True on a completed parse.
    success: bool
    # Human-readable message describing the outcome.
    message: str
    # Value returned by the parser's parse_pdf call; defaults to None.
    data: Optional[dict] = None


@router.get(
    "/health",
    summary="健康检查",
    description="检查OCR服务的健康状态和配置信息",
    response_description="返回服务状态和模型目录信息"
)
async def health_check():
    """
    Health-check endpoint.

    Reports a fixed status for the OCR service together with the configured
    model directory.

    Returns:
        dict: ``status``, ``service`` and ``model_dir`` entries.
    """
    payload = {"status": "healthy", "service": "OCR PDF Parser"}
    # Expose the configured model directory alongside the static fields.
    payload["model_dir"] = MODEL_DIR
    return payload


@router.post(
    "/parse",
    response_model=ParseResponse,
    summary="上传并解析PDF文件",
    description="上传PDF文件并通过OCR识别提取文本内容",
    response_description="返回OCR识别结果"
)
async def parse_pdf_endpoint(
    file: UploadFile = File(..., description="PDF文件，支持上传任意PDF文档"),
    page_from: int = Form(1, ge=1, description="起始页码（从1开始，默认为1）"),
    page_to: int = Form(0, ge=0, description="结束页码（0表示解析到最后一页，默认为0）"),
    zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数（1-5，数值越大识别精度越高但速度越慢，默认为3）")
):
    """
    Upload a PDF file and run OCR on it.

    The upload is written to a temporary ``.pdf`` file, handed to the shared
    parser in a worker thread, and the temporary file is removed afterwards
    whether or not parsing succeeded.

    Args:
        file: Uploaded PDF (multipart/form-data). Its filename must end in
            ``.pdf`` (case-insensitive).
        page_from: 1-based first page to parse (minimum 1).
        page_to: 1-based last page to parse; 0 selects the default end index.
        zoomin: Image zoom factor, 1-5.

    Returns:
        ParseResponse: ``success=True``, a message naming the file, and the
        parser output in ``data``.

    Raises:
        HTTPException: 400 if the filename is missing or not ``.pdf``, or if
            the uploaded content is empty.
        HTTPException: 500 if any other error occurs while parsing.
    """
    # The filename can be absent on an upload; reject that as "not a PDF"
    # rather than crashing on ``None.lower()``.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="只支持PDF文件")

    temp_file = None
    try:
        content = await file.read()
        if not content:
            raise HTTPException(status_code=400, detail="文件为空")

        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            # Record the path before writing so the ``finally`` block can
            # still remove the file if the write itself fails.
            temp_file = tmp.name
            tmp.write(content)

        logger.info(f"Parsing PDF file: {file.filename}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")

        # parse_pdf is synchronous, so run it in a worker thread to keep the
        # event loop responsive.
        parser = get_parser()
        result = await asyncio.to_thread(
            parser.parse_pdf,
            temp_file,
            zoomin,
            page_from - 1,  # convert to a 0-based index
            # NOTE(review): page_to == 0 is mapped to the fixed index 299, so
            # "to the last page" depends on how parse_pdf treats that bound —
            # confirm against the parser.
            (page_to - 1) if page_to > 0 else 299,
            None  # callback
        )

        return ParseResponse(
            success=True,
            message=f"成功解析PDF: {file.filename}",
            data=result
        )

    except HTTPException:
        # Client errors raised deliberately above (e.g. empty upload) must
        # keep their own status code instead of being rewrapped as 500.
        raise

    except Exception as e:
        logger.error(f"Error parsing PDF: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"解析PDF时发生错误: {str(e)}"
        ) from e

    finally:
        # Best-effort cleanup of the temporary file.
        if temp_file is not None:
            try:
                os.unlink(temp_file)
            except FileNotFoundError:
                pass
            except OSError as e:
                logger.warning(f"Failed to delete temp file {temp_file}: {e}")


@router.post(
    "/parse/bytes",
    response_model=ParseResponse,
    summary="通过二进制数据解析PDF",
    description="直接通过二进制数据解析PDF文件，无需上传文件",
    response_description="返回OCR识别结果"
)
async def parse_pdf_bytes(
    pdf_bytes: bytes = File(..., description="PDF文件的二进制数据（multipart/form-data格式）"),
    filename: str = Form("document.pdf", description="文件名（仅用于日志记录，不影响解析）"),
    page_from: int = Form(1, ge=1, description="起始页码（从1开始，默认为1）"),
    page_to: int = Form(0, ge=0, description="结束页码（0表示解析到最后一页，默认为0）"),
    zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数（1-5，数值越大识别精度越高但速度越慢，默认为3）")
):
    """
    Run OCR on PDF content submitted as raw bytes.

    The bytes are written to a temporary ``.pdf`` file, handed to the shared
    parser in a worker thread, and the temporary file is removed afterwards
    whether or not parsing succeeded.

    Args:
        pdf_bytes: PDF content, submitted as a file field.
        filename: Label used in the log line and the success message only; it
            does not influence parsing.
        page_from: 1-based first page to parse (minimum 1).
        page_to: 1-based last page to parse; 0 selects the default end index.
        zoomin: Image zoom factor, 1-5.

    Returns:
        ParseResponse: ``success=True``, a message naming ``filename``, and
        the parser output in ``data``.

    Raises:
        HTTPException: 400 if ``pdf_bytes`` is empty.
        HTTPException: 500 if any error occurs while parsing.
    """
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="PDF数据为空")

    temp_file = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            # Record the path before writing so the ``finally`` block can
            # still remove the file if the write itself fails.
            temp_file = tmp.name
            tmp.write(pdf_bytes)

        logger.info(f"Parsing PDF bytes (filename: {filename}), pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")

        # parse_pdf is synchronous, so run it in a worker thread to keep the
        # event loop responsive.
        parser = get_parser()
        result = await asyncio.to_thread(
            parser.parse_pdf,
            temp_file,
            zoomin,
            page_from - 1,  # convert to a 0-based index
            # NOTE(review): page_to == 0 is mapped to the fixed index 299, so
            # "to the last page" depends on how parse_pdf treats that bound —
            # confirm against the parser.
            (page_to - 1) if page_to > 0 else 299,
            None  # callback
        )

        return ParseResponse(
            success=True,
            message=f"成功解析PDF: {filename}",
            data=result
        )

    except HTTPException:
        # Keep any explicit HTTP error intact rather than rewrapping it as 500.
        raise

    except Exception as e:
        logger.error(f"Error parsing PDF bytes: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"解析PDF时发生错误: {str(e)}"
        ) from e

    finally:
        # Best-effort cleanup of the temporary file.
        if temp_file is not None:
            try:
                os.unlink(temp_file)
            except FileNotFoundError:
                pass
            except OSError as e:
                logger.warning(f"Failed to delete temp file {temp_file}: {e}")


@router.post(
    "/parse/path",
    response_model=ParseResponse,
    summary="通过文件路径解析PDF",
    description="通过服务器本地文件路径解析PDF文件",
    response_description="返回OCR识别结果"
)
async def parse_pdf_path(
    file_path: str = Form(..., description="PDF文件在服务器上的本地路径（必须是可访问的绝对路径）"),
    page_from: int = Form(1, ge=1, description="起始页码（从1开始，默认为1）"),
    page_to: int = Form(0, ge=0, description="结束页码（0表示解析到最后一页，默认为0）"),
    zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数（1-5，数值越大识别精度越高但速度越慢，默认为3）")
):
    """
    Run OCR on a PDF that already exists on the server's filesystem.

    Args:
        file_path: Path of the PDF on the server; must end in ``.pdf``
            (case-insensitive) and refer to an existing regular file.
        page_from: 1-based first page to parse (minimum 1).
        page_to: 1-based last page to parse; 0 selects the default end index.
        zoomin: Image zoom factor, 1-5.

    Returns:
        ParseResponse: ``success=True``, a message naming ``file_path``, and
        the parser output in ``data``.

    Raises:
        HTTPException: 400 if the path does not end in ``.pdf``.
        HTTPException: 404 if the path is not an existing file.
        HTTPException: 500 if any error occurs while parsing.

    Note:
        SECURITY: ``file_path`` is supplied by the caller and is not confined
        to any base directory here, so this endpoint can read any ``.pdf``
        file the server process can access. Expose it only on trusted
        networks, or restrict the accepted paths before wider deployment.
    """
    # Check the extension before touching the filesystem so the 404/400 split
    # cannot be used to probe for the existence of arbitrary non-PDF paths.
    if not file_path.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="只支持PDF文件")

    # isfile (not exists) so a directory named "*.pdf" is reported as missing
    # instead of failing later inside the parser with a 500.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")

    try:
        logger.info(f"Parsing PDF from path: {file_path}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")

        # parse_pdf is synchronous, so run it in a worker thread to keep the
        # event loop responsive.
        parser = get_parser()
        result = await asyncio.to_thread(
            parser.parse_pdf,
            file_path,
            zoomin,
            page_from - 1,  # convert to a 0-based index
            # NOTE(review): page_to == 0 is mapped to the fixed index 299, so
            # "to the last page" depends on how parse_pdf treats that bound —
            # confirm against the parser.
            (page_to - 1) if page_to > 0 else 299,
            None  # callback
        )

        return ParseResponse(
            success=True,
            message=f"成功解析PDF: {file_path}",
            data=result
        )

    except HTTPException:
        # Keep any explicit HTTP error intact rather than rewrapping it as 500.
        raise

    except Exception as e:
        logger.error(f"Error parsing PDF from path: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"解析PDF时发生错误: {str(e)}"
        ) from e