# # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # """ 简化的PDF解析器,只使用OCR处理PDF文档 从 RAGFlow 的 RAGFlowPdfParser 中提取OCR相关功能,移除了: - 布局识别(Layout Recognition) - 表格结构识别(Table Structure Recognition) - 文本合并和语义分析 - RAG相关功能 只保留: - PDF转图片 - OCR文本检测和识别 - 基本的文本和位置信息返回 """ import logging import sys import threading from io import BytesIO from pathlib import Path from timeit import default_timer as timer import numpy as np import pdfplumber import trio # 处理导入问题:支持直接运行和模块导入 try: _package = __package__ except NameError: _package = None if _package is None: # 直接运行时,添加父目录到路径并使用绝对导入 parent_dir = Path(__file__).parent.parent if str(parent_dir) not in sys.path: sys.path.insert(0, str(parent_dir)) from ocr.config import PARALLEL_DEVICES from ocr.ocr import OCR else: # 作为模块导入时使用相对导入 from config import PARALLEL_DEVICES from ocr import OCR LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" if LOCK_KEY_pdfplumber not in sys.modules: sys.modules[LOCK_KEY_pdfplumber] = threading.Lock() class SimplePdfParser: """ 简化的PDF解析器,只使用OCR处理PDF 使用方法: parser = SimplePdfParser() result = parser.parse_pdf("file.pdf") # 或传入二进制数据 # result 格式: # { # "pages": [ # { # "page_number": 1, # "boxes": [ # { # "text": "识别到的文本", # "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], # "confidence": 0.95 # }, # ... # ] # }, # ... # ] # } """ def __init__(self, model_dir=None): """ 初始化PDF解析器 Args: model_dir: OCR模型目录,如果为None则使用默认路径 """ self.ocr = OCR(model_dir=model_dir) self.parallel_limiter = None if PARALLEL_DEVICES > 1: self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)] def __ocr_page(self, page_num, img, zoomin=3, device_id=None): """ 对单页进行OCR处理 Args: page_num: 页码 img: PIL Image对象 zoomin: 放大倍数(用于坐标缩放) device_id: GPU设备ID Returns: list: OCR结果列表,每个元素为 {"text": str, "bbox": list, "confidence": float} """ start = timer() img_np = np.array(img) # 文本检测 # detect方法返回: zip对象,格式为 (box_coords, (text, score)) # 但检测阶段text和score都是默认值,需要后续识别 detection_result = self.ocr.detect(img_np, device_id) if detection_result is None: return [] # 转换为列表并提取box坐标 # detect返回的格式是zip,每个元素是 (box_coords, (text, score)) # 在检测阶段,text是空字符串,score是0 bxs = list(detection_result) logging.info(f"Page {page_num}: OCR detection found {len(bxs)} boxes in {timer() - start:.2f}s") if not bxs: return [] # 解析检测结果并准备识别 boxes_to_reg = [] start = timer() for box_coords, _, _ in bxs: # box_coords 是四边形坐标: [[x0, y0], [x1, y0], [x1, y1], [x0, y1]] # 转换为原始坐标(考虑zoomin) box_coords_np = np.array(box_coords, dtype=np.float32) original_coords = box_coords_np / zoomin # 缩放回原始坐标 # 裁剪图像用于识别 # 使用放大后的坐标裁剪(因为img_np是放大后的图像) crop_box = box_coords_np crop_img = self.ocr.get_rotate_crop_image(img_np, crop_box) boxes_to_reg.append({ "bbox": original_coords.tolist(), "crop_img": crop_img }) # 批量识别文本 ocr_results = [] if boxes_to_reg: crop_imgs = [b["crop_img"] for b in boxes_to_reg] texts = self.ocr.recognize_batch(crop_imgs, device_id) # 组装结果 for i, b in enumerate(boxes_to_reg): if i < len(texts) and texts[i]: # 过滤空文本 ocr_results.append({ "text": texts[i], "bbox": b["bbox"], "confidence": 0.9 # 简化版本,不计算具体置信度 }) logging.info(f"Page {page_num}: OCR recognition {len(ocr_results)} boxes cost {timer() - start:.2f}s") return ocr_results async def __ocr_page_async(self, page_num, img, zoomin, device_id, limiter, callback): """ 异步OCR处理单页 Args: page_num: 页码 img: PIL Image对象 zoomin: 放大倍数 device_id: GPU设备ID limiter: 并发限制器 callback: 进度回调函数 """ if limiter: async with limiter: result = await trio.to_thread.run_sync( lambda: self.__ocr_page(page_num, img, zoomin, device_id) ) else: result = await trio.to_thread.run_sync( lambda: self.__ocr_page(page_num, img, zoomin, device_id) ) if callback and page_num % 5 == 0: callback(prog=page_num * 0.9 / 100, msg=f"Processing page {page_num}...") return result def __convert_pdf_to_images(self, pdf_source, zoomin=3, page_from=0, page_to=299): """ 将PDF转换为图片 Args: pdf_source: PDF文件路径(str)或二进制数据(bytes) zoomin: 放大倍数,默认3(72*3=216 DPI) page_from: 起始页码(从0开始) page_to: 结束页码 Returns: list: PIL Image对象列表 """ start = timer() page_images = [] try: with sys.modules[LOCK_KEY_pdfplumber]: pdf = pdfplumber.open(pdf_source) if isinstance(pdf_source, str) else pdfplumber.open(BytesIO(pdf_source)) try: # 转换为图片,resolution = 72 * zoomin page_images = [ p.to_image(resolution=72 * zoomin, antialias=True).annotated for i, p in enumerate(pdf.pages[page_from:page_to]) ] pdf.close() except Exception as e: logging.warning(f"Failed to convert PDF pages {page_from}-{page_to}: {str(e)}") if hasattr(pdf, 'close'): pdf.close() except Exception as e: logging.exception(f"Error converting PDF to images: {str(e)}") logging.info(f"Converted {len(page_images)} pages to images in {timer() - start:.2f}s") return page_images def parse_pdf(self, pdf_source, zoomin=3, page_from=0, page_to=299, callback=None): """ 解析PDF文档,使用OCR识别文本 Args: pdf_source: PDF文件路径(str)或二进制数据(bytes) zoomin: 放大倍数,默认3 page_from: 起始页码(从0开始) page_to: 结束页码 callback: 进度回调函数,格式: callback(prog: float, msg: str) Returns: dict: 解析结果 { "pages": [ { "page_number": int, "boxes": [ { "text": str, "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], "confidence": float }, ... ] }, ... ] } """ if callback: callback(0.0, "Starting PDF parsing...") # 1. 转换为图片 if callback: callback(0.1, "Converting PDF to images...") page_images = self.__convert_pdf_to_images(pdf_source, zoomin, page_from, page_to) if not page_images: logging.warning("No pages converted from PDF") return {"pages": []} # 2. OCR处理 async def process_all_pages(): pages_result = [] if self.parallel_limiter: # 并行处理(多GPU) async with trio.open_nursery() as nursery: tasks = [] for i, img in enumerate(page_images): page_num = page_from + i + 1 device_id = i % PARALLEL_DEVICES task = nursery.start_soon( self.__ocr_page_async, page_num, img, zoomin, device_id, self.parallel_limiter[device_id], callback ) tasks.append(task) # 等待所有任务完成并收集结果 for i, task in enumerate(tasks): result = await task pages_result.append({ "page_number": page_from + i + 1, "boxes": result }) else: # 串行处理(单GPU或CPU) for i, img in enumerate(page_images): page_num = page_from + i + 1 result = await trio.to_thread.run_sync( lambda img=img, pn=page_num: self.__ocr_page(pn, img, zoomin, 0) ) pages_result.append({ "page_number": page_num, "boxes": result }) if callback: callback(0.1 + (i + 1) * 0.9 / len(page_images), f"Processing page {page_num}...") return pages_result # 运行异步处理 if callback: callback(0.2, "Starting OCR processing...") start = timer() pages_result = trio.run(process_all_pages) logging.info(f"OCR processing completed in {timer() - start:.2f}s") if callback: callback(1.0, "OCR processing completed") return { "pages": pages_result } # 向后兼容的别名 PdfParser = SimplePdfParser