将ocr解析模块独立出来

This commit is contained in:
dzr
2025-10-31 17:50:25 +08:00
parent 4318179904
commit 4603a86df4
5 changed files with 1870 additions and 2140 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -32,22 +32,6 @@
import sys
from pathlib import Path
try:
_package = __package__
except NameError:
_package = None
if _package is None:
# 直接运行时,添加父目录到路径并使用绝对导入
parent_dir = Path(__file__).parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
from ocr.ocr import OCR, TextDetector, TextRecognizer
from ocr.pdf_parser import SimplePdfParser
else:
# 作为模块导入时使用相对导入
from .ocr import OCR, TextDetector, TextRecognizer
from .pdf_parser import SimplePdfParser
__all__ = ['OCR', 'TextDetector', 'TextRecognizer', 'SimplePdfParser']

View File

@@ -28,30 +28,14 @@ from typing import Optional
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pydantic import BaseModel, Field
# 处理导入问题:支持直接运行和模块导入
try:
_package = __package__
except NameError:
_package = None
if _package is None:
# 直接运行时,添加父目录到路径并使用绝对导入
parent_dir = Path(__file__).parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
from ocr.pdf_parser import SimplePdfParser
from ocr.config import MODEL_DIR
else:
# 作为模块导入时使用相对导入
from pdf_parser import SimplePdfParser
from config import MODEL_DIR
from ocr import SimplePdfParser
from ocr.config import MODEL_DIR
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])
ocr_router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])
# 全局解析器实例(懒加载)
_parser_instance: Optional[SimplePdfParser] = None
@@ -330,3 +314,212 @@ async def parse_pdf_path(
detail=f"解析PDF时发生错误: {str(e)}"
)
# NOTE(review): the same change set also declares the router object as
# `ocr_router`; confirm that the name `router` is still defined in the final
# module before relying on this decorator.
@router.post(
    "/parse_into_bboxes",
    summary="解析PDF并返回边界框",
    description="解析PDF文件并返回文本边界框信息用于文档结构化处理",
    response_description="返回包含文本边界框的列表"
)
async def parse_into_bboxes_endpoint(
    pdf_bytes: bytes = File(..., description="PDF文件的二进制数据"),
    filename: str = Form("document.pdf", description="文件名(仅用于日志)"),
    zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数1-5默认为3")
):
    """
    Parse an uploaded PDF and return its text bounding boxes.

    The uploaded bytes are written to a temporary ``.pdf`` file whose path is
    passed to ``parser.parse_into_bboxes``. The parser call runs through
    ``asyncio.to_thread`` so it does not block the event loop. Every item the
    parser returns is shallow-copied into a plain ``dict`` and placed under
    the ``"bboxes"`` key of the response data.

    Per the original endpoint description, each bounding box is expected to
    carry the text content, page number, coordinates (x0, x1, top, bottom),
    a layout type (e.g. text, table, figure) and optional image data.
    NOTE(review): image data is passed through unchanged; confirm it is
    serializable by the response model.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        filename: File name, used only in log and response messages.
        zoomin: Image zoom factor, constrained to 1-5 by the form validator.

    Returns:
        ParseResponse: ``success=True`` with ``data={"bboxes": [...]}``.

    Raises:
        HTTPException: 400 if ``pdf_bytes`` is empty.
        HTTPException: 500 if anything fails while writing or parsing.
    """
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="PDF数据为空")

    temp_file = None
    try:
        # Persist the upload to disk because the parser is given a file path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            # Record the path BEFORE writing: the file is created with
            # delete=False, so if write() fails the `finally` block below
            # must still know which file to remove.
            temp_file = tmp.name
            tmp.write(pdf_bytes)

        logger.info(f"Parsing PDF into bboxes: {filename}, zoomin={zoomin}")

        # Progress hook handed to the parser; it only logs the progress.
        def progress_callback(prog, msg):
            logger.info(f"Progress: {prog:.2%} - {msg}")

        parser = get_parser()
        result = await asyncio.to_thread(
            parser.parse_into_bboxes,
            temp_file,
            progress_callback,
            zoomin
        )

        # Shallow-copy each bbox into a plain dict. Image payloads are kept
        # in their original form (no base64 conversion) for compatibility.
        processed_result = [dict(bbox) for bbox in result]

        return ParseResponse(
            success=True,
            message=f"成功解析PDF为边界框: {filename}",
            data={"bboxes": processed_result}
        )
    except Exception as e:
        logger.error(f"Error parsing PDF into bboxes: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"解析PDF为边界框时发生错误: {str(e)}"
        ) from e
    finally:
        # Best-effort removal of the temporary file; a failure here is only
        # logged so it never masks the real result or error.
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError as e:
                logger.warning(f"Failed to delete temp file {temp_file}: {e}")
class TextRequest(BaseModel):
    """文本处理请求模型"""
    # Request body for the text-processing endpoints in this module
    # (`remove_tag_endpoint` and `extract_positions_endpoint`).
    # NOTE: the docstring above is kept verbatim because pydantic exposes a
    # model's docstring as its schema description.

    # Required field: the text to process.
    text: str = Field(
        ...,
        description="需要处理的文本内容",
    )
class RemoveTagResponse(BaseModel):
    """移除标签响应模型"""
    # Response body returned by `remove_tag_endpoint`.
    # NOTE: the docstring above is kept verbatim because pydantic exposes a
    # model's docstring as its schema description.

    # Whether the request was handled successfully.
    success: bool

    # Human-readable status message.
    message: str

    # The text with position tags removed; defaults to None.
    text: Optional[str] = None
@router.post(
    "/remove_tag",
    response_model=RemoveTagResponse,
    summary="移除文本中的位置标签",
    description="从文本中移除PDF解析生成的位置标签格式@@页码\t坐标##",
    response_description="返回移除标签后的文本"
)
async def remove_tag_endpoint(request: TextRequest):
    """
    Strip PDF position tags from a piece of text.

    Position tags have the shape ``@@page`` + TAB-separated coordinates +
    ``##``. The actual stripping is delegated to
    ``SimplePdfParser.remove_tag``.

    Args:
        request: Request object carrying the text to clean.

    Returns:
        RemoveTagResponse: ``success=True`` plus the cleaned text.

    Raises:
        HTTPException: 400 if the text is empty.
        HTTPException: 500 if removing the tags fails.
    """
    source_text = request.text
    if not source_text:
        raise HTTPException(status_code=400, detail="文本内容不能为空")

    try:
        without_tags = SimplePdfParser.remove_tag(source_text)
        response = RemoveTagResponse(
            success=True,
            message="成功移除文本标签",
            text=without_tags
        )
        return response
    except Exception as e:
        # Any failure (parser or response construction) becomes a 500.
        logger.error(f"Error removing tag: {str(e)}", exc_info=True)
        failure_detail = f"移除标签时发生错误: {str(e)}"
        raise HTTPException(status_code=500, detail=failure_detail)
class ExtractPositionsResponse(BaseModel):
    """提取位置信息响应模型"""
    # Response body returned by `extract_positions_endpoint`.
    # NOTE: the docstring above is kept verbatim because pydantic exposes a
    # model's docstring as its schema description.

    # Whether the request was handled successfully.
    success: bool

    # Human-readable status message.
    message: str

    # Extracted position entries; defaults to None.
    positions: Optional[list] = None
@router.post(
    "/extract_positions",
    response_model=ExtractPositionsResponse,
    summary="从文本中提取位置信息",
    description="从包含位置标签的文本中提取所有位置坐标信息",
    response_description="返回提取到的位置信息列表"
)
async def extract_positions_endpoint(request: TextRequest):
    """
    Extract every position tag found in a piece of text.

    Position tags have the shape ``@@page`` + TAB-separated coordinates +
    ``##``. Extraction is delegated to ``SimplePdfParser.extract_positions``.
    Each item it returns is read by index as
    ``(page_numbers, left, right, top, bottom)`` and converted into a dict
    with those five keys.

    Args:
        request: Request object carrying the text to scan.

    Returns:
        ExtractPositionsResponse: ``success=True`` plus the list of position
        dicts.

    Raises:
        HTTPException: 400 if the text is empty.
        HTTPException: 500 if extraction or conversion fails.
    """
    if not request.text:
        raise HTTPException(status_code=400, detail="文本内容不能为空")

    try:
        positions = SimplePdfParser.extract_positions(request.text)

        # Re-shape each positional record into a keyed dict for the response.
        serializable_positions = []
        for entry in positions:
            serializable_positions.append({
                "page_numbers": entry[0],
                "left": entry[1],
                "right": entry[2],
                "top": entry[3],
                "bottom": entry[4]
            })

        summary_message = f"成功提取 {len(positions)} 个位置信息"
        return ExtractPositionsResponse(
            success=True,
            message=summary_message,
            positions=serializable_positions
        )
    except Exception as e:
        # Any failure (parser, indexing, response construction) becomes a 500.
        logger.error(f"Error extracting positions: {str(e)}", exc_info=True)
        failure_detail = f"提取位置信息时发生错误: {str(e)}"
        raise HTTPException(status_code=500, detail=failure_detail)

View File

@@ -29,25 +29,8 @@ import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
# 处理直接运行时的导入问题
# 当直接运行 python ocr/main.py 时__package__ 为 None
# 当作为模块运行时python -m ocr.main__package__ 为 'ocr'
try:
_package = __package__
except NameError:
_package = None
if _package is None:
# 直接运行脚本时,添加父目录到路径
parent_dir = Path(__file__).parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
from api import router as ocr_router
from config import MODEL_DIR
else:
# 作为模块导入时使用相对导入
from api import router as ocr_router
from config import MODEL_DIR
from ocr.api import ocr_router
from ocr.config import MODEL_DIR
# 配置日志
logging.basicConfig(

File diff suppressed because it is too large Load Diff