将ocr解析模块独立出来
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -32,22 +32,6 @@
|
|||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
try:
|
|
||||||
_package = __package__
|
|
||||||
except NameError:
|
|
||||||
_package = None
|
|
||||||
|
|
||||||
if _package is None:
|
|
||||||
# 直接运行时,添加父目录到路径并使用绝对导入
|
|
||||||
parent_dir = Path(__file__).parent.parent
|
|
||||||
if str(parent_dir) not in sys.path:
|
|
||||||
sys.path.insert(0, str(parent_dir))
|
|
||||||
from ocr.ocr import OCR, TextDetector, TextRecognizer
|
|
||||||
from ocr.pdf_parser import SimplePdfParser
|
|
||||||
else:
|
|
||||||
# 作为模块导入时使用相对导入
|
|
||||||
from .ocr import OCR, TextDetector, TextRecognizer
|
|
||||||
from .pdf_parser import SimplePdfParser
|
|
||||||
|
|
||||||
__all__ = ['OCR', 'TextDetector', 'TextRecognizer', 'SimplePdfParser']
|
__all__ = ['OCR', 'TextDetector', 'TextRecognizer', 'SimplePdfParser']
|
||||||
|
|
||||||
|
|||||||
857
ocr/api.py
857
ocr/api.py
@@ -1,332 +1,525 @@
|
|||||||
#
|
#
|
||||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
# you may not use this file except in compliance with the License.
|
||||||
# You may obtain a copy of the License at
|
# You may obtain a copy of the License at
|
||||||
#
|
#
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
#
|
#
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
"""
|
"""
|
||||||
OCR PDF处理的FastAPI路由
|
OCR PDF处理的FastAPI路由
|
||||||
提供HTTP接口用于PDF的OCR识别
|
提供HTTP接口用于PDF的OCR识别
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
# 处理导入问题:支持直接运行和模块导入
|
from ocr import SimplePdfParser
|
||||||
|
from ocr.config import MODEL_DIR
|
||||||
try:
|
|
||||||
_package = __package__
|
logger = logging.getLogger(__name__)
|
||||||
except NameError:
|
|
||||||
_package = None
|
ocr_router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])
|
||||||
|
|
||||||
if _package is None:
|
# 全局解析器实例(懒加载)
|
||||||
# 直接运行时,添加父目录到路径并使用绝对导入
|
_parser_instance: Optional[SimplePdfParser] = None
|
||||||
parent_dir = Path(__file__).parent.parent
|
|
||||||
if str(parent_dir) not in sys.path:
|
|
||||||
sys.path.insert(0, str(parent_dir))
|
def get_parser() -> SimplePdfParser:
|
||||||
from ocr.pdf_parser import SimplePdfParser
|
"""获取全局解析器实例(单例模式)"""
|
||||||
from ocr.config import MODEL_DIR
|
global _parser_instance
|
||||||
else:
|
if _parser_instance is None:
|
||||||
# 作为模块导入时使用相对导入
|
logger.info(f"Initializing OCR parser with model_dir={MODEL_DIR}")
|
||||||
from pdf_parser import SimplePdfParser
|
_parser_instance = SimplePdfParser(model_dir=MODEL_DIR)
|
||||||
from config import MODEL_DIR
|
return _parser_instance
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
class ParseResponse(BaseModel):
|
||||||
router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])
|
"""解析响应模型"""
|
||||||
|
success: bool
|
||||||
# 全局解析器实例(懒加载)
|
message: str
|
||||||
_parser_instance: Optional[SimplePdfParser] = None
|
data: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
def get_parser() -> SimplePdfParser:
|
@router.get(
|
||||||
"""获取全局解析器实例(单例模式)"""
|
"/health",
|
||||||
global _parser_instance
|
summary="健康检查",
|
||||||
if _parser_instance is None:
|
description="检查OCR服务的健康状态和配置信息",
|
||||||
logger.info(f"Initializing OCR parser with model_dir={MODEL_DIR}")
|
response_description="返回服务状态和模型目录信息"
|
||||||
_parser_instance = SimplePdfParser(model_dir=MODEL_DIR)
|
)
|
||||||
return _parser_instance
|
async def health_check():
|
||||||
|
"""
|
||||||
|
健康检查端点
|
||||||
class ParseResponse(BaseModel):
|
|
||||||
"""解析响应模型"""
|
用于检查OCR服务的运行状态和配置信息。
|
||||||
success: bool
|
|
||||||
message: str
|
Returns:
|
||||||
data: Optional[dict] = None
|
dict: 包含服务状态和模型目录的信息
|
||||||
|
"""
|
||||||
|
return {
|
||||||
@router.get(
|
"status": "healthy",
|
||||||
"/health",
|
"service": "OCR PDF Parser",
|
||||||
summary="健康检查",
|
"model_dir": MODEL_DIR
|
||||||
description="检查OCR服务的健康状态和配置信息",
|
}
|
||||||
response_description="返回服务状态和模型目录信息"
|
|
||||||
)
|
|
||||||
async def health_check():
|
@router.post(
|
||||||
"""
|
"/parse",
|
||||||
健康检查端点
|
response_model=ParseResponse,
|
||||||
|
summary="上传并解析PDF文件",
|
||||||
用于检查OCR服务的运行状态和配置信息。
|
description="上传PDF文件并通过OCR识别提取文本内容",
|
||||||
|
response_description="返回OCR识别结果"
|
||||||
Returns:
|
)
|
||||||
dict: 包含服务状态和模型目录的信息
|
async def parse_pdf_endpoint(
|
||||||
"""
|
file: UploadFile = File(..., description="PDF文件,支持上传任意PDF文档"),
|
||||||
return {
|
page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"),
|
||||||
"status": "healthy",
|
page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"),
|
||||||
"service": "OCR PDF Parser",
|
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)")
|
||||||
"model_dir": MODEL_DIR
|
):
|
||||||
}
|
"""
|
||||||
|
上传并解析PDF文件
|
||||||
|
|
||||||
@router.post(
|
通过上传PDF文件,使用OCR技术识别并提取其中的文本内容。
|
||||||
"/parse",
|
支持指定解析的页码范围,以及调整图像放大倍数以平衡识别精度和速度。
|
||||||
response_model=ParseResponse,
|
|
||||||
summary="上传并解析PDF文件",
|
Args:
|
||||||
description="上传PDF文件并通过OCR识别提取文本内容",
|
file: 上传的PDF文件(multipart/form-data格式)
|
||||||
response_description="返回OCR识别结果"
|
page_from: 起始页码(从1开始,最小值为1)
|
||||||
)
|
page_to: 结束页码(0表示解析到最后一页,最小值为0)
|
||||||
async def parse_pdf_endpoint(
|
zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢)
|
||||||
file: UploadFile = File(..., description="PDF文件,支持上传任意PDF文档"),
|
|
||||||
page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"),
|
Returns:
|
||||||
page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"),
|
ParseResponse: 包含解析结果的响应对象,包括:
|
||||||
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)")
|
- success: 是否成功
|
||||||
):
|
- message: 操作结果消息
|
||||||
"""
|
- data: OCR识别的文本内容和元数据
|
||||||
上传并解析PDF文件
|
|
||||||
|
Raises:
|
||||||
通过上传PDF文件,使用OCR技术识别并提取其中的文本内容。
|
HTTPException: 400 - 如果文件不是PDF格式或文件为空
|
||||||
支持指定解析的页码范围,以及调整图像放大倍数以平衡识别精度和速度。
|
HTTPException: 500 - 如果解析过程中发生错误
|
||||||
|
"""
|
||||||
Args:
|
if not file.filename.lower().endswith('.pdf'):
|
||||||
file: 上传的PDF文件(multipart/form-data格式)
|
raise HTTPException(status_code=400, detail="只支持PDF文件")
|
||||||
page_from: 起始页码(从1开始,最小值为1)
|
|
||||||
page_to: 结束页码(0表示解析到最后一页,最小值为0)
|
# 保存上传的文件到临时目录
|
||||||
zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢)
|
temp_file = None
|
||||||
|
try:
|
||||||
Returns:
|
# 读取文件内容
|
||||||
ParseResponse: 包含解析结果的响应对象,包括:
|
content = await file.read()
|
||||||
- success: 是否成功
|
if not content:
|
||||||
- message: 操作结果消息
|
raise HTTPException(status_code=400, detail="文件为空")
|
||||||
- data: OCR识别的文本内容和元数据
|
|
||||||
|
# 创建临时文件
|
||||||
Raises:
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
||||||
HTTPException: 400 - 如果文件不是PDF格式或文件为空
|
tmp.write(content)
|
||||||
HTTPException: 500 - 如果解析过程中发生错误
|
temp_file = tmp.name
|
||||||
"""
|
|
||||||
if not file.filename.lower().endswith('.pdf'):
|
logger.info(f"Parsing PDF file: {file.filename}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")
|
||||||
raise HTTPException(status_code=400, detail="只支持PDF文件")
|
|
||||||
|
# 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行)
|
||||||
# 保存上传的文件到临时目录
|
parser = get_parser()
|
||||||
temp_file = None
|
result = await asyncio.to_thread(
|
||||||
try:
|
parser.parse_pdf,
|
||||||
# 读取文件内容
|
temp_file,
|
||||||
content = await file.read()
|
zoomin,
|
||||||
if not content:
|
page_from - 1, # 转换为从0开始的索引
|
||||||
raise HTTPException(status_code=400, detail="文件为空")
|
(page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引
|
||||||
|
None # callback
|
||||||
# 创建临时文件
|
)
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
|
||||||
tmp.write(content)
|
return ParseResponse(
|
||||||
temp_file = tmp.name
|
success=True,
|
||||||
|
message=f"成功解析PDF: {file.filename}",
|
||||||
logger.info(f"Parsing PDF file: {file.filename}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")
|
data=result
|
||||||
|
)
|
||||||
# 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行)
|
|
||||||
parser = get_parser()
|
except Exception as e:
|
||||||
result = await asyncio.to_thread(
|
logger.error(f"Error parsing PDF: {str(e)}", exc_info=True)
|
||||||
parser.parse_pdf,
|
raise HTTPException(
|
||||||
temp_file,
|
status_code=500,
|
||||||
zoomin,
|
detail=f"解析PDF时发生错误: {str(e)}"
|
||||||
page_from - 1, # 转换为从0开始的索引
|
)
|
||||||
(page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引
|
|
||||||
None # callback
|
finally:
|
||||||
)
|
# 清理临时文件
|
||||||
|
if temp_file and os.path.exists(temp_file):
|
||||||
return ParseResponse(
|
try:
|
||||||
success=True,
|
os.unlink(temp_file)
|
||||||
message=f"成功解析PDF: {file.filename}",
|
except Exception as e:
|
||||||
data=result
|
logger.warning(f"Failed to delete temp file {temp_file}: {e}")
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
@router.post(
|
||||||
logger.error(f"Error parsing PDF: {str(e)}", exc_info=True)
|
"/parse/bytes",
|
||||||
raise HTTPException(
|
response_model=ParseResponse,
|
||||||
status_code=500,
|
summary="通过二进制数据解析PDF",
|
||||||
detail=f"解析PDF时发生错误: {str(e)}"
|
description="直接通过二进制数据解析PDF文件,无需上传文件",
|
||||||
)
|
response_description="返回OCR识别结果"
|
||||||
|
)
|
||||||
finally:
|
async def parse_pdf_bytes(
|
||||||
# 清理临时文件
|
pdf_bytes: bytes = File(..., description="PDF文件的二进制数据(multipart/form-data格式)"),
|
||||||
if temp_file and os.path.exists(temp_file):
|
filename: str = Form("document.pdf", description="文件名(仅用于日志记录,不影响解析)"),
|
||||||
try:
|
page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"),
|
||||||
os.unlink(temp_file)
|
page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"),
|
||||||
except Exception as e:
|
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)")
|
||||||
logger.warning(f"Failed to delete temp file {temp_file}: {e}")
|
):
|
||||||
|
"""
|
||||||
|
直接通过二进制数据解析PDF
|
||||||
@router.post(
|
|
||||||
"/parse/bytes",
|
适用于已获取PDF二进制数据的场景,无需文件上传步骤。
|
||||||
response_model=ParseResponse,
|
直接将PDF的二进制数据提交即可进行OCR识别。
|
||||||
summary="通过二进制数据解析PDF",
|
|
||||||
description="直接通过二进制数据解析PDF文件,无需上传文件",
|
Args:
|
||||||
response_description="返回OCR识别结果"
|
pdf_bytes: PDF文件的二进制数据(以文件形式提交)
|
||||||
)
|
filename: 文件名(仅用于日志记录,不影响实际解析过程)
|
||||||
async def parse_pdf_bytes(
|
page_from: 起始页码(从1开始,最小值为1)
|
||||||
pdf_bytes: bytes = File(..., description="PDF文件的二进制数据(multipart/form-data格式)"),
|
page_to: 结束页码(0表示解析到最后一页,最小值为0)
|
||||||
filename: str = Form("document.pdf", description="文件名(仅用于日志记录,不影响解析)"),
|
zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢)
|
||||||
page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"),
|
|
||||||
page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"),
|
Returns:
|
||||||
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)")
|
ParseResponse: 包含解析结果的响应对象
|
||||||
):
|
|
||||||
"""
|
Raises:
|
||||||
直接通过二进制数据解析PDF
|
HTTPException: 400 - 如果PDF数据为空
|
||||||
|
HTTPException: 500 - 如果解析过程中发生错误
|
||||||
适用于已获取PDF二进制数据的场景,无需文件上传步骤。
|
"""
|
||||||
直接将PDF的二进制数据提交即可进行OCR识别。
|
if not pdf_bytes:
|
||||||
|
raise HTTPException(status_code=400, detail="PDF数据为空")
|
||||||
Args:
|
|
||||||
pdf_bytes: PDF文件的二进制数据(以文件形式提交)
|
# 保存到临时文件
|
||||||
filename: 文件名(仅用于日志记录,不影响实际解析过程)
|
temp_file = None
|
||||||
page_from: 起始页码(从1开始,最小值为1)
|
try:
|
||||||
page_to: 结束页码(0表示解析到最后一页,最小值为0)
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
||||||
zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢)
|
tmp.write(pdf_bytes)
|
||||||
|
temp_file = tmp.name
|
||||||
Returns:
|
|
||||||
ParseResponse: 包含解析结果的响应对象
|
logger.info(f"Parsing PDF bytes (filename: {filename}), pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")
|
||||||
|
|
||||||
Raises:
|
# 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行)
|
||||||
HTTPException: 400 - 如果PDF数据为空
|
parser = get_parser()
|
||||||
HTTPException: 500 - 如果解析过程中发生错误
|
result = await asyncio.to_thread(
|
||||||
"""
|
parser.parse_pdf,
|
||||||
if not pdf_bytes:
|
temp_file,
|
||||||
raise HTTPException(status_code=400, detail="PDF数据为空")
|
zoomin,
|
||||||
|
page_from - 1, # 转换为从0开始的索引
|
||||||
# 保存到临时文件
|
(page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引
|
||||||
temp_file = None
|
None # callback
|
||||||
try:
|
)
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
|
||||||
tmp.write(pdf_bytes)
|
return ParseResponse(
|
||||||
temp_file = tmp.name
|
success=True,
|
||||||
|
message=f"成功解析PDF: {filename}",
|
||||||
logger.info(f"Parsing PDF bytes (filename: {filename}), pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")
|
data=result
|
||||||
|
)
|
||||||
# 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行)
|
|
||||||
parser = get_parser()
|
except Exception as e:
|
||||||
result = await asyncio.to_thread(
|
logger.error(f"Error parsing PDF bytes: {str(e)}", exc_info=True)
|
||||||
parser.parse_pdf,
|
raise HTTPException(
|
||||||
temp_file,
|
status_code=500,
|
||||||
zoomin,
|
detail=f"解析PDF时发生错误: {str(e)}"
|
||||||
page_from - 1, # 转换为从0开始的索引
|
)
|
||||||
(page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引
|
|
||||||
None # callback
|
finally:
|
||||||
)
|
# 清理临时文件
|
||||||
|
if temp_file and os.path.exists(temp_file):
|
||||||
return ParseResponse(
|
try:
|
||||||
success=True,
|
os.unlink(temp_file)
|
||||||
message=f"成功解析PDF: {filename}",
|
except Exception as e:
|
||||||
data=result
|
logger.warning(f"Failed to delete temp file {temp_file}: {e}")
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
@router.post(
|
||||||
logger.error(f"Error parsing PDF bytes: {str(e)}", exc_info=True)
|
"/parse/path",
|
||||||
raise HTTPException(
|
response_model=ParseResponse,
|
||||||
status_code=500,
|
summary="通过文件路径解析PDF",
|
||||||
detail=f"解析PDF时发生错误: {str(e)}"
|
description="通过服务器本地文件路径解析PDF文件",
|
||||||
)
|
response_description="返回OCR识别结果"
|
||||||
|
)
|
||||||
finally:
|
async def parse_pdf_path(
|
||||||
# 清理临时文件
|
file_path: str = Form(..., description="PDF文件在服务器上的本地路径(必须是可访问的绝对路径)"),
|
||||||
if temp_file and os.path.exists(temp_file):
|
page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"),
|
||||||
try:
|
page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"),
|
||||||
os.unlink(temp_file)
|
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)")
|
||||||
except Exception as e:
|
):
|
||||||
logger.warning(f"Failed to delete temp file {temp_file}: {e}")
|
"""
|
||||||
|
通过文件路径解析PDF
|
||||||
|
|
||||||
@router.post(
|
适用于PDF文件已经存在于服务器上的场景。
|
||||||
"/parse/path",
|
通过提供文件路径直接进行OCR识别,无需上传文件。
|
||||||
response_model=ParseResponse,
|
|
||||||
summary="通过文件路径解析PDF",
|
Args:
|
||||||
description="通过服务器本地文件路径解析PDF文件",
|
file_path: PDF文件在服务器上的本地路径(必须是服务器可访问的绝对路径)
|
||||||
response_description="返回OCR识别结果"
|
page_from: 起始页码(从1开始,最小值为1)
|
||||||
)
|
page_to: 结束页码(0表示解析到最后一页,最小值为0)
|
||||||
async def parse_pdf_path(
|
zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢)
|
||||||
file_path: str = Form(..., description="PDF文件在服务器上的本地路径(必须是可访问的绝对路径)"),
|
|
||||||
page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"),
|
Returns:
|
||||||
page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"),
|
ParseResponse: 包含解析结果的响应对象
|
||||||
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)")
|
|
||||||
):
|
Raises:
|
||||||
"""
|
HTTPException: 400 - 如果文件不是PDF格式
|
||||||
通过文件路径解析PDF
|
HTTPException: 404 - 如果文件不存在
|
||||||
|
HTTPException: 500 - 如果解析过程中发生错误
|
||||||
适用于PDF文件已经存在于服务器上的场景。
|
|
||||||
通过提供文件路径直接进行OCR识别,无需上传文件。
|
Note:
|
||||||
|
此端点需要确保提供的文件路径在服务器上可访问。
|
||||||
Args:
|
建议仅在内网环境或受信任的环境中使用,避免路径遍历安全风险。
|
||||||
file_path: PDF文件在服务器上的本地路径(必须是服务器可访问的绝对路径)
|
"""
|
||||||
page_from: 起始页码(从1开始,最小值为1)
|
if not os.path.exists(file_path):
|
||||||
page_to: 结束页码(0表示解析到最后一页,最小值为0)
|
raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")
|
||||||
zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢)
|
|
||||||
|
if not file_path.lower().endswith('.pdf'):
|
||||||
Returns:
|
raise HTTPException(status_code=400, detail="只支持PDF文件")
|
||||||
ParseResponse: 包含解析结果的响应对象
|
|
||||||
|
try:
|
||||||
Raises:
|
logger.info(f"Parsing PDF from path: {file_path}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")
|
||||||
HTTPException: 400 - 如果文件不是PDF格式
|
|
||||||
HTTPException: 404 - 如果文件不存在
|
# 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行)
|
||||||
HTTPException: 500 - 如果解析过程中发生错误
|
parser = get_parser()
|
||||||
|
result = await asyncio.to_thread(
|
||||||
Note:
|
parser.parse_pdf,
|
||||||
此端点需要确保提供的文件路径在服务器上可访问。
|
file_path,
|
||||||
建议仅在内网环境或受信任的环境中使用,避免路径遍历安全风险。
|
zoomin,
|
||||||
"""
|
page_from - 1, # 转换为从0开始的索引
|
||||||
if not os.path.exists(file_path):
|
(page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引
|
||||||
raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")
|
None # callback
|
||||||
|
)
|
||||||
if not file_path.lower().endswith('.pdf'):
|
|
||||||
raise HTTPException(status_code=400, detail="只支持PDF文件")
|
return ParseResponse(
|
||||||
|
success=True,
|
||||||
try:
|
message=f"成功解析PDF: {file_path}",
|
||||||
logger.info(f"Parsing PDF from path: {file_path}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}")
|
data=result
|
||||||
|
)
|
||||||
# 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行)
|
|
||||||
parser = get_parser()
|
except Exception as e:
|
||||||
result = await asyncio.to_thread(
|
logger.error(f"Error parsing PDF from path: {str(e)}", exc_info=True)
|
||||||
parser.parse_pdf,
|
raise HTTPException(
|
||||||
file_path,
|
status_code=500,
|
||||||
zoomin,
|
detail=f"解析PDF时发生错误: {str(e)}"
|
||||||
page_from - 1, # 转换为从0开始的索引
|
)
|
||||||
(page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引
|
|
||||||
None # callback
|
|
||||||
)
|
@router.post(
|
||||||
|
"/parse_into_bboxes",
|
||||||
return ParseResponse(
|
summary="解析PDF并返回边界框",
|
||||||
success=True,
|
description="解析PDF文件并返回文本边界框信息,用于文档结构化处理",
|
||||||
message=f"成功解析PDF: {file_path}",
|
response_description="返回包含文本边界框的列表"
|
||||||
data=result
|
)
|
||||||
)
|
async def parse_into_bboxes_endpoint(
|
||||||
|
pdf_bytes: bytes = File(..., description="PDF文件的二进制数据"),
|
||||||
except Exception as e:
|
filename: str = Form("document.pdf", description="文件名(仅用于日志)"),
|
||||||
logger.error(f"Error parsing PDF from path: {str(e)}", exc_info=True)
|
zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,默认为3)")
|
||||||
raise HTTPException(
|
):
|
||||||
status_code=500,
|
"""
|
||||||
detail=f"解析PDF时发生错误: {str(e)}"
|
解析PDF并返回边界框
|
||||||
)
|
|
||||||
|
此接口用于将PDF文档解析为结构化文本边界框,每个边界框包含:
|
||||||
|
- 文本内容
|
||||||
|
- 页面编号
|
||||||
|
- 坐标信息(x0, x1, top, bottom)
|
||||||
|
- 布局类型(如 text, table, figure 等)
|
||||||
|
- 图像数据(如果有)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
pdf_bytes: PDF文件的二进制数据
|
||||||
|
filename: 文件名(仅用于日志记录)
|
||||||
|
zoomin: 图像放大倍数(1-5之间)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: 包含解析结果的对象,data字段为边界框列表
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
HTTPException: 400 - 如果PDF数据为空
|
||||||
|
HTTPException: 500 - 如果解析过程中发生错误
|
||||||
|
"""
|
||||||
|
if not pdf_bytes:
|
||||||
|
raise HTTPException(status_code=400, detail="PDF数据为空")
|
||||||
|
|
||||||
|
temp_file = None
|
||||||
|
try:
|
||||||
|
# 保存到临时文件
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
|
||||||
|
tmp.write(pdf_bytes)
|
||||||
|
temp_file = tmp.name
|
||||||
|
|
||||||
|
logger.info(f"Parsing PDF into bboxes: {filename}, zoomin={zoomin}")
|
||||||
|
|
||||||
|
# 定义一个简单的callback包装器,用于处理进度回调(记录日志)
|
||||||
|
def progress_callback(prog, msg):
|
||||||
|
logger.info(f"Progress: {prog:.2%} - {msg}")
|
||||||
|
|
||||||
|
parser = get_parser()
|
||||||
|
result = await asyncio.to_thread(
|
||||||
|
parser.parse_into_bboxes,
|
||||||
|
temp_file,
|
||||||
|
progress_callback,
|
||||||
|
zoomin
|
||||||
|
)
|
||||||
|
|
||||||
|
# 将图像数据转换为base64或None
|
||||||
|
processed_result = []
|
||||||
|
for bbox in result:
|
||||||
|
processed_bbox = dict(bbox)
|
||||||
|
# 如果有图像,转换为base64(如果需要的话,可以在这里处理)
|
||||||
|
# 但为了保持兼容性,我们保留原始格式
|
||||||
|
processed_result.append(processed_bbox)
|
||||||
|
|
||||||
|
return ParseResponse(
|
||||||
|
success=True,
|
||||||
|
message=f"成功解析PDF为边界框: {filename}",
|
||||||
|
data={"bboxes": processed_result}
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error parsing PDF into bboxes: {str(e)}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"解析PDF为边界框时发生错误: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# 清理临时文件
|
||||||
|
if temp_file and os.path.exists(temp_file):
|
||||||
|
try:
|
||||||
|
os.unlink(temp_file)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to delete temp file {temp_file}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
class TextRequest(BaseModel):
|
||||||
|
"""文本处理请求模型"""
|
||||||
|
text: str = Field(..., description="需要处理的文本内容")
|
||||||
|
|
||||||
|
|
||||||
|
class RemoveTagResponse(BaseModel):
|
||||||
|
"""移除标签响应模型"""
|
||||||
|
success: bool
|
||||||
|
message: str
|
||||||
|
text: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/remove_tag",
|
||||||
|
response_model=RemoveTagResponse,
|
||||||
|
summary="移除文本中的位置标签",
|
||||||
|
description="从文本中移除PDF解析生成的位置标签(格式:@@页码\t坐标##)",
|
||||||
|
response_description="返回移除标签后的文本"
|
||||||
|
)
|
||||||
|
async def remove_tag_endpoint(request: TextRequest):
|
||||||
|
"""
|
||||||
|
移除文本中的位置标签
|
||||||
|
|
||||||
|
此接口用于从包含位置标签的文本中移除标签信息。
|
||||||
|
位置标签格式为:@@页码\t坐标##,例如:@@1\t100.0\t200.0\t50.0\t60.0##
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: 包含待处理文本的请求对象
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
RemoveTagResponse: 包含处理结果的响应对象
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
HTTPException: 400 - 如果文本为空
|
||||||
|
"""
|
||||||
|
if not request.text:
|
||||||
|
raise HTTPException(status_code=400, detail="文本内容不能为空")
|
||||||
|
|
||||||
|
try:
|
||||||
|
cleaned_text = SimplePdfParser.remove_tag(request.text)
|
||||||
|
|
||||||
|
return RemoveTagResponse(
|
||||||
|
success=True,
|
||||||
|
message="成功移除文本标签",
|
||||||
|
text=cleaned_text
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error removing tag: {str(e)}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"移除标签时发生错误: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractPositionsResponse(BaseModel):
|
||||||
|
"""提取位置信息响应模型"""
|
||||||
|
success: bool
|
||||||
|
message: str
|
||||||
|
positions: Optional[list] = None
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/extract_positions",
|
||||||
|
response_model=ExtractPositionsResponse,
|
||||||
|
summary="从文本中提取位置信息",
|
||||||
|
description="从包含位置标签的文本中提取所有位置坐标信息",
|
||||||
|
response_description="返回提取到的位置信息列表"
|
||||||
|
)
|
||||||
|
async def extract_positions_endpoint(request: TextRequest):
|
||||||
|
"""
|
||||||
|
从文本中提取位置信息
|
||||||
|
|
||||||
|
此接口用于从包含位置标签的文本中提取所有位置坐标信息。
|
||||||
|
位置标签格式为:@@页码\t坐标##
|
||||||
|
|
||||||
|
返回的位置信息格式为:
|
||||||
|
[
|
||||||
|
([页码列表], left, right, top, bottom),
|
||||||
|
...
|
||||||
|
]
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: 包含待处理文本的请求对象
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ExtractPositionsResponse: 包含提取结果的响应对象
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
HTTPException: 400 - 如果文本为空
|
||||||
|
"""
|
||||||
|
if not request.text:
|
||||||
|
raise HTTPException(status_code=400, detail="文本内容不能为空")
|
||||||
|
|
||||||
|
try:
|
||||||
|
positions = SimplePdfParser.extract_positions(request.text)
|
||||||
|
|
||||||
|
# 将位置信息转换为可序列化的格式
|
||||||
|
serializable_positions = [
|
||||||
|
{
|
||||||
|
"page_numbers": pos[0],
|
||||||
|
"left": pos[1],
|
||||||
|
"right": pos[2],
|
||||||
|
"top": pos[3],
|
||||||
|
"bottom": pos[4]
|
||||||
|
}
|
||||||
|
for pos in positions
|
||||||
|
]
|
||||||
|
|
||||||
|
return ExtractPositionsResponse(
|
||||||
|
success=True,
|
||||||
|
message=f"成功提取 {len(positions)} 个位置信息",
|
||||||
|
positions=serializable_positions
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error extracting positions: {str(e)}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"提取位置信息时发生错误: {str(e)}"
|
||||||
|
)
|
||||||
21
ocr/main.py
21
ocr/main.py
@@ -29,25 +29,8 @@ import uvicorn
|
|||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
# 处理直接运行时的导入问题
|
from ocr.api import ocr_router
|
||||||
# 当直接运行 python ocr/main.py 时,__package__ 为 None
|
from ocr.config import MODEL_DIR
|
||||||
# 当作为模块运行时(python -m ocr.main),__package__ 为 'ocr'
|
|
||||||
try:
|
|
||||||
_package = __package__
|
|
||||||
except NameError:
|
|
||||||
_package = None
|
|
||||||
|
|
||||||
if _package is None:
|
|
||||||
# 直接运行脚本时,添加父目录到路径
|
|
||||||
parent_dir = Path(__file__).parent.parent
|
|
||||||
if str(parent_dir) not in sys.path:
|
|
||||||
sys.path.insert(0, str(parent_dir))
|
|
||||||
from api import router as ocr_router
|
|
||||||
from config import MODEL_DIR
|
|
||||||
else:
|
|
||||||
# 作为模块导入时使用相对导入
|
|
||||||
from api import router as ocr_router
|
|
||||||
from config import MODEL_DIR
|
|
||||||
|
|
||||||
# 配置日志
|
# 配置日志
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
|
|||||||
1658
ocr/pdf_parser.py
1658
ocr/pdf_parser.py
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user