# TERES_fastapi_backend/ocr/pdf_parser.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""
简化的PDF解析器，只使用OCR处理PDF文档

从 RAGFlow 的 RAGFlowPdfParser 中提取OCR相关功能，移除了：
- 布局识别（Layout Recognition）
- 表格结构识别（Table Structure Recognition）
- 文本合并和语义分析
- RAG相关功能

只保留：
- PDF转图片
- OCR文本检测和识别
- 基本的文本和位置信息返回
"""

import logging
import sys
import threading
from io import BytesIO
from pathlib import Path
from timeit import default_timer as timer

import numpy as np
import pdfplumber
import trio

# Resolve the project imports for both execution modes: running this file
# directly as a script, or importing it as a module.
try:
    _package = __package__
except NameError:
    _package = None

if _package is None:
    # Direct execution: put the parent directory on sys.path and import the
    # dependencies through the ``ocr`` package with absolute imports.
    parent_dir = Path(__file__).parent.parent
    if str(parent_dir) not in sys.path:
        sys.path.insert(0, str(parent_dir))
    from ocr.config import PARALLEL_DEVICES
    from ocr.ocr import OCR
else:
    # Imported as a module. The original comment described these as relative
    # imports, but the statements carry no leading dot, so ``config`` and
    # ``ocr`` are looked up as top-level names on sys.path.
    # NOTE(review): confirm this matches the deployment layout; if this file is
    # imported as part of a package, ``from .config import ...`` and
    # ``from .ocr import OCR`` may be what was intended.
    from config import PARALLEL_DEVICES
    from ocr import OCR

# Key under which a single lock is stored in ``sys.modules``; the PDF-to-image
# conversion in this file acquires it around its pdfplumber calls.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
# Register the lock only if nothing is stored under the key yet, so an entry
# that already exists is left untouched.
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())


class SimplePdfParser:
    """
    Minimal PDF parser that extracts text purely through OCR.

    Usage:
        parser = SimplePdfParser()
        result = parser.parse_pdf("file.pdf")  # or pass the raw PDF bytes
        # result layout:
        # {
        #     "pages": [
        #         {
        #             "page_number": 1,
        #             "boxes": [
        #                 {
        #                     "text": "recognized text",
        #                     "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
        #                     "confidence": 0.95
        #                 },
        #                 ...
        #             ]
        #         },
        #         ...
        #     ]
        # }
    """

    def __init__(self, model_dir=None):
        """
        Create the parser and its OCR engine.

        Args:
            model_dir: OCR model directory; ``None`` is forwarded to ``OCR``
                unchanged (the original docstring says the default path is
                used in that case).
        """
        self.ocr = OCR(model_dir=model_dir)
        # One single-slot limiter per device, created only when more than one
        # device is configured; ``None`` selects the serial code path.
        self.parallel_limiter = None
        if PARALLEL_DEVICES > 1:
            self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)]

    def __ocr_page(self, page_num, img, zoomin=3, device_id=None):
        """
        Run text detection and recognition on a single page image.

        Args:
            page_num: Page number, used only in log messages.
            img: Page image (anything ``np.array`` can convert).
            zoomin: Zoom factor the image was rendered with; detected
                coordinates are divided by it before being returned.
            device_id: Device id forwarded to the OCR engine.

        Returns:
            list: One ``{"text": str, "bbox": list, "confidence": float}``
            dict per box whose recognized text is non-empty.
        """
        start = timer()
        img_np = np.array(img)

        # Per the original notes, ``detect`` yields (box_coords, (text, score))
        # pairs whose text/score are placeholders at this stage.
        detection_result = self.ocr.detect(img_np, device_id)
        if detection_result is None:
            return []

        bxs = list(detection_result)

        logging.info(f"Page {page_num}: OCR detection found {len(bxs)} boxes in {timer() - start:.2f}s")

        if not bxs:
            return []

        start = timer()
        boxes_to_reg = []
        for detection in bxs:
            # Only the box coordinates (first element) are needed. Indexing
            # instead of fixed-arity unpacking avoids a ValueError on the
            # two-element entries described above.
            box_coords_np = np.array(detection[0], dtype=np.float32)
            boxes_to_reg.append({
                # Returned coordinates are scaled back by the zoom factor.
                "bbox": (box_coords_np / zoomin).tolist(),
                # The crop uses the unscaled coordinates because ``img_np`` is
                # the zoomed image.
                "crop_img": self.ocr.get_rotate_crop_image(img_np, box_coords_np),
            })

        # Recognize all crops in one batch.
        texts = self.ocr.recognize_batch([b["crop_img"] for b in boxes_to_reg], device_id)

        # ``zip`` stops at the shorter sequence, so a short ``texts`` list is
        # tolerated; boxes with empty text are dropped.
        ocr_results = [
            {
                "text": text,
                "bbox": b["bbox"],
                "confidence": 0.9,  # simplified: no real confidence is computed
            }
            for b, text in zip(boxes_to_reg, texts)
            if text
        ]

        logging.info(f"Page {page_num}: OCR recognition {len(ocr_results)} boxes cost {timer() - start:.2f}s")
        return ocr_results

    async def __ocr_page_async(self, page_num, img, zoomin, device_id, limiter, callback):
        """
        OCR a single page in a worker thread, optionally under a limiter.

        Args:
            page_num: Page number.
            img: Page image.
            zoomin: Zoom factor the image was rendered with.
            device_id: Device id forwarded to the OCR engine.
            limiter: Async context manager held while the page is processed,
                or a falsy value for no limiting.
            callback: Optional progress callback ``callback(prog, msg)``.

        Returns:
            list: The OCR results for the page.
        """
        def run_ocr():
            return self.__ocr_page(page_num, img, zoomin, device_id)

        if limiter:
            async with limiter:
                result = await trio.to_thread.run_sync(run_ocr)
        else:
            result = await trio.to_thread.run_sync(run_ocr)

        # Report every fifth page. The callback is called positionally, like
        # every other call in this class, and the value is clamped because
        # ``page_num * 0.9 / 100`` exceeds 1.0 for page numbers above 111.
        if callback and page_num % 5 == 0:
            callback(min(page_num * 0.9 / 100, 1.0), f"Processing page {page_num}...")

        return result

    def __convert_pdf_to_images(self, pdf_source, zoomin=3, page_from=0, page_to=299):
        """
        Render the requested PDF pages to images.

        Args:
            pdf_source: PDF file path (``str`` or ``pathlib.Path``) or the raw
                PDF bytes.
            zoomin: Zoom factor; pages are rendered at ``72 * zoomin`` DPI.
            page_from: Index of the first page (0-based).
            page_to: End index of the page slice (exclusive).

        Returns:
            list: The rendered page images; empty if opening or rendering
            failed (failures are logged, not raised).
        """
        start = timer()
        page_images = []

        try:
            # All pdfplumber work happens while holding the shared lock.
            with sys.modules[LOCK_KEY_pdfplumber]:
                source = pdf_source if isinstance(pdf_source, (str, Path)) else BytesIO(pdf_source)
                # The context manager closes the PDF on success and on error.
                with pdfplumber.open(source) as pdf:
                    try:
                        page_images = [
                            p.to_image(resolution=72 * zoomin, antialias=True).annotated
                            for p in pdf.pages[page_from:page_to]
                        ]
                    except Exception as e:
                        logging.warning(f"Failed to convert PDF pages {page_from}-{page_to}: {str(e)}")
        except Exception as e:
            logging.exception(f"Error converting PDF to images: {str(e)}")

        logging.info(f"Converted {len(page_images)} pages to images in {timer() - start:.2f}s")
        return page_images

    def parse_pdf(self, pdf_source, zoomin=3, page_from=0, page_to=299, callback=None):
        """
        Parse a PDF document by rendering its pages and running OCR on them.

        Args:
            pdf_source: PDF file path or the raw PDF bytes.
            zoomin: Zoom factor used for rendering, default 3.
            page_from: Index of the first page (0-based).
            page_to: End index of the page slice (exclusive).
            callback: Optional progress callback, called as
                ``callback(prog: float, msg: str)``.

        Returns:
            dict: The parse result
            {
                "pages": [
                    {
                        "page_number": int,
                        "boxes": [
                            {
                                "text": str,
                                "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
                                "confidence": float
                            },
                            ...
                        ]
                    },
                    ...
                ]
            }
            ``{"pages": []}`` is returned when no page could be rendered.
        """
        if callback:
            callback(0.0, "Starting PDF parsing...")

        # 1. Render the pages to images.
        if callback:
            callback(0.1, "Converting PDF to images...")
        page_images = self.__convert_pdf_to_images(pdf_source, zoomin, page_from, page_to)

        if not page_images:
            logging.warning("No pages converted from PDF")
            return {"pages": []}

        # 2. OCR every page.
        async def process_all_pages():
            if not self.parallel_limiter:
                # Serial path: one page at a time on device 0.
                pages_result = []
                for i, img in enumerate(page_images):
                    page_num = page_from + i + 1
                    result = await trio.to_thread.run_sync(
                        lambda img=img, pn=page_num: self.__ocr_page(pn, img, zoomin, 0)
                    )
                    pages_result.append({
                        "page_number": page_num,
                        "boxes": result
                    })
                    if callback:
                        callback(0.1 + (i + 1) * 0.9 / len(page_images), f"Processing page {page_num}...")
                return pages_result

            # Parallel path: pages are spread round-robin over the devices.
            # ``nursery.start_soon`` returns None, so each task stores its
            # result in the slot for its page index, which keeps page order.
            device_count = len(self.parallel_limiter)
            page_boxes = [None] * len(page_images)

            async def ocr_into_slot(slot, page_num, img, device_id):
                page_boxes[slot] = await self.__ocr_page_async(
                    page_num, img, zoomin, device_id,
                    self.parallel_limiter[device_id], callback
                )

            # Leaving the nursery block waits for every started task.
            async with trio.open_nursery() as nursery:
                for i, img in enumerate(page_images):
                    nursery.start_soon(
                        ocr_into_slot, i, page_from + i + 1, img, i % device_count
                    )

            return [
                {"page_number": page_from + i + 1, "boxes": boxes}
                for i, boxes in enumerate(page_boxes)
            ]

        # Drive the async processing to completion.
        if callback:
            callback(0.2, "Starting OCR processing...")

        start = timer()
        pages_result = trio.run(process_all_pages)
        logging.info(f"OCR processing completed in {timer() - start:.2f}s")

        if callback:
            callback(1.0, "OCR processing completed")

        return {
            "pages": pages_result
        }


# Backward-compatible alias: ``PdfParser`` refers to the same class as
# ``SimplePdfParser``.
PdfParser = SimplePdfParser