将ocr解析模块独立出来

This commit is contained in:
dzr
2025-10-31 17:50:25 +08:00
parent 4318179904
commit 4603a86df4
5 changed files with 1870 additions and 2140 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -32,22 +32,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
try:
_package = __package__
except NameError:
_package = None
if _package is None:
# 直接运行时,添加父目录到路径并使用绝对导入
parent_dir = Path(__file__).parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
from ocr.ocr import OCR, TextDetector, TextRecognizer
from ocr.pdf_parser import SimplePdfParser
else:
# 作为模块导入时使用相对导入
from .ocr import OCR, TextDetector, TextRecognizer
from .pdf_parser import SimplePdfParser
__all__ = ['OCR', 'TextDetector', 'TextRecognizer', 'SimplePdfParser'] __all__ = ['OCR', 'TextDetector', 'TextRecognizer', 'SimplePdfParser']

View File

@@ -28,30 +28,14 @@ from typing import Optional
from fastapi import APIRouter, File, Form, HTTPException, UploadFile from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from pydantic import BaseModel from pydantic import BaseModel, Field
# 处理导入问题:支持直接运行和模块导入 from ocr import SimplePdfParser
from ocr.config import MODEL_DIR
try:
_package = __package__
except NameError:
_package = None
if _package is None:
# 直接运行时,添加父目录到路径并使用绝对导入
parent_dir = Path(__file__).parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
from ocr.pdf_parser import SimplePdfParser
from ocr.config import MODEL_DIR
else:
# 作为模块导入时使用相对导入
from pdf_parser import SimplePdfParser
from config import MODEL_DIR
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"]) ocr_router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])
# 全局解析器实例(懒加载) # 全局解析器实例(懒加载)
_parser_instance: Optional[SimplePdfParser] = None _parser_instance: Optional[SimplePdfParser] = None
@@ -330,3 +314,212 @@ async def parse_pdf_path(
detail=f"解析PDF时发生错误: {str(e)}" detail=f"解析PDF时发生错误: {str(e)}"
) )
# Registered on `ocr_router`: the module-level router is created under that
# name, so the former `router` name is no longer bound in this module.
@ocr_router.post(
    "/parse_into_bboxes",
    summary="解析PDF并返回边界框",
    description="解析PDF文件并返回文本边界框信息用于文档结构化处理",
    response_description="返回包含文本边界框的列表"
)
async def parse_into_bboxes_endpoint(
    pdf_bytes: bytes = File(..., description="PDF文件的二进制数据"),
    filename: str = Form("document.pdf", description="文件名(仅用于日志)"),
    zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数1-5默认为3")
):
    """
    Parse an uploaded PDF and return its text bounding boxes.

    The uploaded bytes are written to a temporary ``.pdf`` file, handed to
    ``parser.parse_into_bboxes`` on a worker thread, and the temporary file
    is removed afterwards regardless of the outcome.

    Args:
        pdf_bytes: Raw bytes of the PDF file.
        filename: Name used in log and response messages only.
        zoomin: Image zoom factor, constrained to 1-5 by the form validator.

    Returns:
        ParseResponse: ``success=True`` with ``data={"bboxes": [...]}``, where
        each entry is a shallow ``dict`` copy of one parser result.
        NOTE(review): the original author's notes say each box carries text,
        page number, x0/x1/top/bottom coordinates, a layout type and optional
        image data; that cannot be confirmed from this function.

    Raises:
        HTTPException: 400 if ``pdf_bytes`` is empty.
        HTTPException: 500 if anything fails while staging or parsing.
    """
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="PDF数据为空")

    temp_file = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            # Remember the path before writing: the file is created with
            # delete=False, so a failed write would otherwise leave it behind
            # with nothing for the `finally` clause to clean up.
            temp_file = tmp.name
            tmp.write(pdf_bytes)

        logger.info(f"Parsing PDF into bboxes: {filename}, zoomin={zoomin}")

        def progress_callback(prog, msg):
            """Forward parser progress updates to the module logger."""
            logger.info(f"Progress: {prog:.2%} - {msg}")

        parser = get_parser()
        # Run the parser off the event loop so other requests keep being served.
        result = await asyncio.to_thread(
            parser.parse_into_bboxes,
            temp_file,
            progress_callback,
            zoomin
        )

        # Shallow-copy each box. Values (including any image payload) are
        # passed through untouched to stay compatible with existing consumers.
        # NOTE(review): if a box holds a non-JSON-serializable image object,
        # response encoding may fail - confirm against the parser's output.
        processed_result = [dict(bbox) for bbox in result]

        return ParseResponse(
            success=True,
            message=f"成功解析PDF为边界框: {filename}",
            data={"bboxes": processed_result}
        )
    except Exception as e:
        logger.error(f"Error parsing PDF into bboxes: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"解析PDF为边界框时发生错误: {str(e)}"
        ) from e
    finally:
        # Best-effort cleanup: a failed delete is logged, never raised.
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError as e:
                logger.warning(f"Failed to delete temp file {temp_file}: {e}")
class TextRequest(BaseModel):
    """文本处理请求模型"""

    # Request body shared by the text-processing endpoints.
    # The class docstring above is kept verbatim because pydantic surfaces
    # model docstrings as the schema description in the generated API docs.
    # `default=...` (Ellipsis) marks the field as required.
    text: str = Field(default=..., description="需要处理的文本内容")
class RemoveTagResponse(BaseModel):
    """移除标签响应模型"""

    # Response body of the tag-removal endpoint.
    # The class docstring above is kept verbatim because pydantic surfaces
    # model docstrings as the schema description in the generated API docs.

    # Whether the operation succeeded.
    success: bool
    # Human-readable status message.
    message: str
    # Text with the tags removed; optional, defaults to None.
    text: Optional[str] = Field(default=None)
# Registered on `ocr_router`: the module-level router is created under that
# name, so the former `router` name is no longer bound in this module.
@ocr_router.post(
    "/remove_tag",
    response_model=RemoveTagResponse,
    summary="移除文本中的位置标签",
    description="从文本中移除PDF解析生成的位置标签格式@@页码\t坐标##",
    response_description="返回移除标签后的文本"
)
async def remove_tag_endpoint(request: TextRequest):
    """
    Strip PDF position tags from a piece of text.

    A position tag starts with ``@@``, holds a page number followed by
    tab-separated coordinates, and ends with ``##``. The actual stripping is
    delegated to ``SimplePdfParser.remove_tag``.

    Args:
        request: Request body whose ``text`` field holds the tagged text.

    Returns:
        RemoveTagResponse: ``success=True`` with the cleaned text in ``text``.

    Raises:
        HTTPException: 400 if ``request.text`` is empty.
        HTTPException: 500 if tag removal or response construction fails.
    """
    if not request.text:
        raise HTTPException(status_code=400, detail="文本内容不能为空")

    try:
        cleaned_text = SimplePdfParser.remove_tag(request.text)
        return RemoveTagResponse(
            success=True,
            message="成功移除文本标签",
            text=cleaned_text
        )
    except Exception as e:
        logger.error(f"Error removing tag: {str(e)}", exc_info=True)
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(
            status_code=500,
            detail=f"移除标签时发生错误: {str(e)}"
        ) from e
class ExtractPositionsResponse(BaseModel):
    """提取位置信息响应模型"""

    # Response body of the position-extraction endpoint.
    # The class docstring above is kept verbatim because pydantic surfaces
    # model docstrings as the schema description in the generated API docs.

    # Whether the operation succeeded.
    success: bool
    # Human-readable status message.
    message: str
    # Extracted position entries; optional, defaults to None.
    positions: Optional[list] = Field(default=None)
# Registered on `ocr_router`: the module-level router is created under that
# name, so the former `router` name is no longer bound in this module.
@ocr_router.post(
    "/extract_positions",
    response_model=ExtractPositionsResponse,
    summary="从文本中提取位置信息",
    description="从包含位置标签的文本中提取所有位置坐标信息",
    response_description="返回提取到的位置信息列表"
)
async def extract_positions_endpoint(request: TextRequest):
    """
    Extract every position entry encoded in a tagged text.

    Extraction is delegated to ``SimplePdfParser.extract_positions``. Each
    returned entry is read by index (0-4) and re-shaped into a dict with the
    keys ``page_numbers``, ``left``, ``right``, ``top`` and ``bottom`` so the
    response is JSON-friendly.

    Args:
        request: Request body whose ``text`` field holds the tagged text.

    Returns:
        ExtractPositionsResponse: ``success=True`` with one dict per
        extracted position in ``positions``.

    Raises:
        HTTPException: 400 if ``request.text`` is empty.
        HTTPException: 500 if extraction or conversion fails.
    """
    if not request.text:
        raise HTTPException(status_code=400, detail="文本内容不能为空")

    try:
        positions = SimplePdfParser.extract_positions(request.text)

        # Index access (rather than tuple unpacking) keeps tolerance for
        # entries that carry more than five items.
        serializable_positions = [
            {
                "page_numbers": pos[0],
                "left": pos[1],
                "right": pos[2],
                "top": pos[3],
                "bottom": pos[4]
            }
            for pos in positions
        ]

        return ExtractPositionsResponse(
            success=True,
            message=f"成功提取 {len(positions)} 个位置信息",
            positions=serializable_positions
        )
    except Exception as e:
        logger.error(f"Error extracting positions: {str(e)}", exc_info=True)
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(
            status_code=500,
            detail=f"提取位置信息时发生错误: {str(e)}"
        ) from e

View File

@@ -29,25 +29,8 @@ import uvicorn
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
# 处理直接运行时的导入问题 from ocr.api import ocr_router
# 当直接运行 python ocr/main.py 时__package__ 为 None from ocr.config import MODEL_DIR
# 当作为模块运行时python -m ocr.main__package__ 为 'ocr'
try:
_package = __package__
except NameError:
_package = None
if _package is None:
# 直接运行脚本时,添加父目录到路径
parent_dir = Path(__file__).parent.parent
if str(parent_dir) not in sys.path:
sys.path.insert(0, str(parent_dir))
from api import router as ocr_router
from config import MODEL_DIR
else:
# 作为模块导入时使用相对导入
from api import router as ocr_router
from config import MODEL_DIR
# 配置日志 # 配置日志
logging.basicConfig( logging.basicConfig(

File diff suppressed because it is too large Load Diff