将ocr解析模块独立出来
This commit is contained in:
339
ocr/pdf_parser.py
Normal file
339
ocr/pdf_parser.py
Normal file
@@ -0,0 +1,339 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""
|
||||
简化的PDF解析器,只使用OCR处理PDF文档
|
||||
|
||||
从 RAGFlow 的 RAGFlowPdfParser 中提取OCR相关功能,移除了:
|
||||
- 布局识别(Layout Recognition)
|
||||
- 表格结构识别(Table Structure Recognition)
|
||||
- 文本合并和语义分析
|
||||
- RAG相关功能
|
||||
|
||||
只保留:
|
||||
- PDF转图片
|
||||
- OCR文本检测和识别
|
||||
- 基本的文本和位置信息返回
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from timeit import default_timer as timer
|
||||
|
||||
import numpy as np
|
||||
import pdfplumber
|
||||
import trio
|
||||
|
||||
# 处理导入问题:支持直接运行和模块导入
|
||||
# Import bootstrap: work both when this file is executed directly and when it
# is imported as part of a package.
#
# ``__package__`` is:
#   * a non-empty string when imported as a package submodule
#     (e.g. ``ocr.pdf_parser`` or ``python -m ocr.pdf_parser``),
#   * ``None`` when the file is executed directly as a script,
#   * ``""`` when imported as a top-level module (its directory is on sys.path).
_package = globals().get("__package__")

if _package:
    # Inside a package: use genuine relative imports so the sibling modules
    # are resolved regardless of what else is on sys.path.
    from .config import PARALLEL_DEVICES
    from .ocr import OCR
elif _package is None:
    # Executed directly: put the parent directory on sys.path so the
    # package can be imported by its absolute name.
    parent_dir = Path(__file__).parent.parent
    if str(parent_dir) not in sys.path:
        sys.path.insert(0, str(parent_dir))
    from ocr.config import PARALLEL_DEVICES
    from ocr.ocr import OCR
else:
    # Imported as a top-level module: sibling modules are importable by
    # their bare names.
    from config import PARALLEL_DEVICES
    from ocr import OCR
|
||||
|
||||
# A single process-wide lock guarding pdfplumber usage. It is parked in
# ``sys.modules`` under a well-known key so that every module using the same
# key shares one lock object; an already-registered lock is left untouched.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
||||
|
||||
|
||||
class SimplePdfParser:
    """
    Minimal PDF parser that extracts text from PDF pages using OCR only.

    Usage:
        parser = SimplePdfParser()
        result = parser.parse_pdf("file.pdf")  # or pass the raw PDF bytes

        # result layout:
        # {
        #     "pages": [
        #         {
        #             "page_number": 1,
        #             "boxes": [
        #                 {
        #                     "text": "recognized text",
        #                     "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
        #                     "confidence": 0.9
        #                 },
        #                 ...
        #             ]
        #         },
        #         ...
        #     ]
        # }

    Note: ``confidence`` is currently a fixed placeholder (0.9); no per-box
    score is computed.
    """

    def __init__(self, model_dir=None):
        """
        Create the parser and its OCR engine.

        Args:
            model_dir: Directory holding the OCR models; ``None`` lets the
                OCR engine use its default location.
        """
        self.ocr = OCR(model_dir=model_dir)
        # One single-slot limiter per device when several devices are
        # configured; ``None`` selects the sequential code path.
        self.parallel_limiter = None
        if PARALLEL_DEVICES > 1:
            self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)]

    def __ocr_page(self, page_num, img, zoomin=3, device_id=None):
        """
        Run text detection and recognition on one rendered page.

        Args:
            page_num: Page number (used for logging only).
            img: Page image (PIL image or anything ``np.array`` accepts).
            zoomin: Scale factor the page was rendered with; box coordinates
                are divided by it to map back to unscaled page coordinates.
            device_id: Device index forwarded to the OCR engine.

        Returns:
            list: One dict per non-empty recognized text,
            ``{"text": str, "bbox": list, "confidence": float}``.
        """
        start = timer()
        img_np = np.array(img)

        # Detection only locates text regions. Each yielded entry starts with
        # the quadrilateral box; the accompanying text/score are placeholders
        # at this stage and are filled in by recognition below.
        detection_result = self.ocr.detect(img_np, device_id)
        if detection_result is None:
            return []

        bxs = list(detection_result)
        logging.info(f"Page {page_num}: OCR detection found {len(bxs)} boxes in {timer() - start:.2f}s")
        if not bxs:
            return []

        start = timer()
        bboxes = []
        crop_imgs = []
        for entry in bxs:
            # Only the first element (the box) is needed. Indexing instead of
            # fixed-arity unpacking works for (box, (text, score)) pairs as
            # well as longer tuples.
            box_coords_np = np.array(entry[0], dtype=np.float32)
            # Crop with the zoomed coordinates because img_np is the zoomed
            # image; report coordinates scaled back by zoomin.
            crop_imgs.append(self.ocr.get_rotate_crop_image(img_np, box_coords_np))
            bboxes.append((box_coords_np / zoomin).tolist())

        # Recognize all crops in one batch.
        texts = self.ocr.recognize_batch(crop_imgs, device_id)

        # Pair boxes with their texts and drop empty recognitions.
        ocr_results = [
            {
                "text": text,
                "bbox": bbox,
                "confidence": 0.9,  # simplified: no real confidence is computed
            }
            for bbox, text in zip(bboxes, texts)
            if text
        ]

        logging.info(f"Page {page_num}: OCR recognition {len(ocr_results)} boxes cost {timer() - start:.2f}s")
        return ocr_results

    async def __ocr_page_async(self, page_num, img, zoomin, device_id, limiter, callback):
        """
        Run ``__ocr_page`` in a worker thread, optionally gated by a limiter.

        Args:
            page_num: Page number.
            img: Page image.
            zoomin: Scale factor the page was rendered with.
            device_id: Device index forwarded to the OCR engine.
            limiter: Async context manager limiting concurrency on the
                device, or a falsy value for no limit.
            callback: Optional progress callback ``callback(prog, msg)``.

        Returns:
            list: The OCR results of the page.
        """
        if limiter:
            async with limiter:
                result = await trio.to_thread.run_sync(self.__ocr_page, page_num, img, zoomin, device_id)
        else:
            result = await trio.to_thread.run_sync(self.__ocr_page, page_num, img, zoomin, device_id)

        # Report every fifth page. The total page count is not known here, so
        # the value is a rough estimate; cap it so it never exceeds 1.0.
        if callback and page_num % 5 == 0:
            callback(min(page_num * 0.9 / 100, 1.0), f"Processing page {page_num}...")

        return result

    def __convert_pdf_to_images(self, pdf_source, zoomin=3, page_from=0, page_to=299):
        """
        Render a range of PDF pages to images.

        Args:
            pdf_source: Path to the PDF (``str`` or ``pathlib.Path``) or the
                PDF content as bytes.
            zoomin: Scale factor; pages are rendered at ``72 * zoomin`` DPI.
            page_from: Index of the first page to render (0-based).
            page_to: Index at which rendering stops (exclusive slice end).

        Returns:
            list: Rendered page images; empty if opening or rendering fails
            (failures are logged, not raised).
        """
        start = timer()
        page_images = []

        try:
            # pdfplumber access is serialized through the process-wide lock.
            with sys.modules[LOCK_KEY_pdfplumber]:
                source = pdf_source if isinstance(pdf_source, (str, Path)) else BytesIO(pdf_source)
                # The context manager closes the PDF exactly once on every path.
                with pdfplumber.open(source) as pdf:
                    try:
                        page_images = [
                            page.to_image(resolution=72 * zoomin, antialias=True).annotated
                            for page in pdf.pages[page_from:page_to]
                        ]
                    except Exception as e:
                        logging.warning(f"Failed to convert PDF pages {page_from}-{page_to}: {str(e)}")
        except Exception as e:
            # Best effort by design: report the problem and return no pages.
            logging.exception(f"Error converting PDF to images: {str(e)}")

        logging.info(f"Converted {len(page_images)} pages to images in {timer() - start:.2f}s")
        return page_images

    def parse_pdf(self, pdf_source, zoomin=3, page_from=0, page_to=299, callback=None):
        """
        Parse a PDF document by rendering its pages and running OCR on them.

        Args:
            pdf_source: Path to the PDF (``str`` or ``pathlib.Path``) or the
                PDF content as bytes.
            zoomin: Render scale factor, default 3.
            page_from: Index of the first page to process (0-based).
            page_to: Index at which processing stops (exclusive).
            callback: Optional progress callback ``callback(prog: float, msg: str)``.

        Returns:
            dict: ``{"pages": [{"page_number": int, "boxes": [...]}, ...]}``
            where each box is
            ``{"text": str, "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
            "confidence": float}``. Page numbers are 1-based and pages are
            returned in document order.
        """
        if callback:
            callback(0.0, "Starting PDF parsing...")
            callback(0.1, "Converting PDF to images...")

        # 1. Render pages to images.
        page_images = self.__convert_pdf_to_images(pdf_source, zoomin, page_from, page_to)
        if not page_images:
            logging.warning("No pages converted from PDF")
            return {"pages": []}

        # 2. OCR every page.
        async def process_all_pages():
            total = len(page_images)

            if self.parallel_limiter:
                # Parallel path (several devices). ``nursery.start_soon``
                # returns nothing, so each task writes its own result into a
                # slot addressed by page index, which also preserves order.
                boxes_by_index = [None] * total

                async def run_one(index, page_num, img, device_id):
                    boxes_by_index[index] = await self.__ocr_page_async(
                        page_num, img, zoomin, device_id,
                        self.parallel_limiter[device_id], callback,
                    )

                # Leaving the nursery block waits for every task to finish.
                async with trio.open_nursery() as nursery:
                    for i, img in enumerate(page_images):
                        nursery.start_soon(run_one, i, page_from + i + 1, img, i % PARALLEL_DEVICES)

                return [
                    {"page_number": page_from + i + 1, "boxes": boxes}
                    for i, boxes in enumerate(boxes_by_index)
                ]

            # Sequential path (single device or CPU).
            pages_result = []
            for i, img in enumerate(page_images):
                page_num = page_from + i + 1
                result = await trio.to_thread.run_sync(self.__ocr_page, page_num, img, zoomin, 0)
                pages_result.append({
                    "page_number": page_num,
                    "boxes": result,
                })
                if callback:
                    callback(0.1 + (i + 1) * 0.9 / total, f"Processing page {page_num}...")
            return pages_result

        if callback:
            callback(0.2, "Starting OCR processing...")

        start = timer()
        pages_result = trio.run(process_all_pages)
        logging.info(f"OCR processing completed in {timer() - start:.2f}s")

        if callback:
            callback(1.0, "OCR processing completed")

        return {
            "pages": pages_result
        }
|
||||
|
||||
|
||||
# 向后兼容的别名
|
||||
PdfParser = SimplePdfParser  # backward-compatible alias: the older name refers to the same class
|
||||
|
||||
Reference in New Issue
Block a user