将ocr解析模块独立出来

2025-10-31 14:38:37 +08:00
parent d78f1fe91d
commit 4318179904
13 changed files with 3262 additions and 23 deletions
--- a/ocr/pdf_parser.py
+++ b/ocr/pdf_parser.py
@@ -0,0 +1,339 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""
+简化的PDF解析器，只使用OCR处理PDF文档
+
+从 RAGFlow 的 RAGFlowPdfParser 中提取OCR相关功能，移除了：
+- 布局识别（Layout Recognition）
+- 表格结构识别（Table Structure Recognition）
+- 文本合并和语义分析
+- RAG相关功能
+
+只保留：
+- PDF转图片
+- OCR文本检测和识别
+- 基本的文本和位置信息返回
+"""
+
+import logging
+import sys
+import threading
+from io import BytesIO
+from pathlib import Path
+from timeit import default_timer as timer
+
+import numpy as np
+import pdfplumber
+import trio
+
+# Import handling: support both running this file directly and importing it.
+try:
+    _package = __package__
+except NameError:
+    _package = None
+
+if _package is None:
+    # No package context (e.g. the file is executed directly): put the parent
+    # directory on sys.path and import through the ``ocr`` package.
+    parent_dir = Path(__file__).parent.parent
+    if str(parent_dir) not in sys.path:
+        sys.path.insert(0, str(parent_dir))
+    from ocr.config import PARALLEL_DEVICES
+    from ocr.ocr import OCR
+else:
+    # Imported as a module: resolve the sibling modules by bare name.
+    # NOTE(review): these are absolute imports, not relative ones, so they only
+    # resolve when ``config`` and ``ocr`` are importable from sys.path and
+    # ``ocr`` exposes ``OCR`` -- confirm against how the package is deployed.
+    from config import PARALLEL_DEVICES
+    from ocr import OCR
+
+# A threading.Lock stored in sys.modules under a fixed key, created only if the
+# key is not already present. It is acquired while pdfplumber opens and renders
+# a PDF. Presumably other modules using the same key share this lock -- verify.
+LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
+if LOCK_KEY_pdfplumber not in sys.modules:
+    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
+
+
+class SimplePdfParser:
+    """OCR-only PDF parser.
+
+    Renders each PDF page to an image, runs OCR text detection on it and then
+    recognizes the text of every detected box. No layout recognition, table
+    structure recognition or text merging is performed.
+
+    Usage:
+        parser = SimplePdfParser()
+        result = parser.parse_pdf("file.pdf")  # or pass the raw PDF bytes
+        # result format:
+        # {
+        #     "pages": [
+        #         {
+        #             "page_number": 1,
+        #             "boxes": [
+        #                 {
+        #                     "text": "recognized text",
+        #                     "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
+        #                     "confidence": 0.9
+        #                 },
+        #                 ...
+        #             ]
+        #         },
+        #         ...
+        #     ]
+        # }
+    """
+
+    def __init__(self, model_dir=None):
+        """Create the OCR engine and, for multi-device setups, the limiters.
+
+        Args:
+            model_dir: Directory holding the OCR models; ``None`` lets ``OCR``
+                pick its default location.
+        """
+        self.ocr = OCR(model_dir=model_dir)
+        # One single-slot limiter per device so that at most one page is
+        # processed on a given device at a time. Stays None when there is
+        # only one device, which selects the serial path in parse_pdf.
+        self.parallel_limiter = None
+        if PARALLEL_DEVICES > 1:
+            self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)]
+
+    def __ocr_page(self, page_num, img, zoomin=3, device_id=None):
+        """Run text detection and recognition on one rendered page.
+
+        Args:
+            page_num: Page number, used only for log messages.
+            img: Rendered page image (anything ``np.array`` can convert).
+            zoomin: Render scale factor; box coordinates are divided by it to
+                map them back to the unscaled page.
+            device_id: Device index forwarded to the OCR engine.
+
+        Returns:
+            list: One ``{"text": str, "bbox": list, "confidence": float}``
+            dict per box with non-empty recognized text.
+        """
+        start = timer()
+        img_np = np.array(img)
+
+        # Detection yields one item per text box whose first element is the
+        # quadrilateral; text and score are placeholders at this stage.
+        detection_result = self.ocr.detect(img_np, device_id)
+        if detection_result is None:
+            return []
+
+        detections = list(detection_result)
+        logging.info(f"Page {page_num}: OCR detection found {len(detections)} boxes in {timer() - start:.2f}s")
+        if not detections:
+            return []
+
+        start = timer()
+        bboxes = []
+        crop_imgs = []
+        for detection in detections:
+            # Index instead of tuple-unpacking: items are documented as
+            # (box_coords, (text, score)), so a fixed 3-way unpack would fail.
+            box_coords_np = np.array(detection[0], dtype=np.float32)
+            # Coordinates reported to the caller are scaled back by zoomin.
+            bboxes.append((box_coords_np / zoomin).tolist())
+            # Cropping uses the zoomed coordinates because img_np is the
+            # zoomed rendering.
+            crop_imgs.append(self.ocr.get_rotate_crop_image(img_np, box_coords_np))
+
+        # Recognize all crops in one batch and drop boxes with empty text.
+        texts = self.ocr.recognize_batch(crop_imgs, device_id)
+        ocr_results = [
+            {
+                "text": text,
+                "bbox": bbox,
+                "confidence": 0.9,  # fixed placeholder; no real score is computed
+            }
+            for text, bbox in zip(texts, bboxes)
+            if text
+        ]
+
+        logging.info(f"Page {page_num}: OCR recognition {len(ocr_results)} boxes cost {timer() - start:.2f}s")
+        return ocr_results
+
+    async def __ocr_page_async(self, page_num, img, zoomin, device_id, limiter, callback):
+        """Run ``__ocr_page`` in a worker thread, optionally under a limiter.
+
+        Args:
+            page_num: Page number of the page being processed.
+            img: Rendered page image.
+            zoomin: Render scale factor.
+            device_id: Device index forwarded to the OCR engine.
+            limiter: Limiter to hold while the page is processed, or a falsy
+                value for no limiting.
+            callback: Optional progress callback ``callback(prog, msg)``,
+                invoked for every fifth page number.
+
+        Returns:
+            list: The OCR results of ``__ocr_page`` for this page.
+        """
+        def run_ocr():
+            return self.__ocr_page(page_num, img, zoomin, device_id)
+
+        if limiter:
+            async with limiter:
+                result = await trio.to_thread.run_sync(run_ocr)
+        else:
+            result = await trio.to_thread.run_sync(run_ocr)
+
+        if callback and page_num % 5 == 0:
+            # Rough estimate that assumes about 100 pages; clamped so the
+            # reported progress never exceeds 1.0 on longer documents. Passed
+            # positionally, like every other callback call in this class.
+            callback(min(page_num * 0.9 / 100, 1.0), f"Processing page {page_num}...")
+
+        return result
+
+    def __convert_pdf_to_images(self, pdf_source, zoomin=3, page_from=0, page_to=299):
+        """Render a page range of a PDF to images.
+
+        Args:
+            pdf_source: PDF file path (``str`` or ``pathlib.Path``) or the raw
+                PDF bytes.
+            zoomin: Scale factor; pages are rendered at ``72 * zoomin`` DPI.
+            page_from: Index of the first page to render (0-based).
+            page_to: End index of the page slice (exclusive).
+
+        Returns:
+            list: Rendered page images; empty if opening or rendering failed
+            (failures are logged, not raised).
+        """
+        start = timer()
+        page_images = []
+
+        try:
+            # All pdfplumber work happens while holding the shared lock.
+            with sys.modules[LOCK_KEY_pdfplumber]:
+                source = pdf_source if isinstance(pdf_source, (str, Path)) else BytesIO(pdf_source)
+                # The context manager closes the PDF on every exit path.
+                with pdfplumber.open(source) as pdf:
+                    try:
+                        page_images = [
+                            p.to_image(resolution=72 * zoomin, antialias=True).annotated
+                            for p in pdf.pages[page_from:page_to]
+                        ]
+                    except Exception as e:
+                        logging.warning(f"Failed to convert PDF pages {page_from}-{page_to}: {str(e)}")
+        except Exception as e:
+            logging.exception(f"Error converting PDF to images: {str(e)}")
+
+        logging.info(f"Converted {len(page_images)} pages to images in {timer() - start:.2f}s")
+        return page_images
+
+    def parse_pdf(self, pdf_source, zoomin=3, page_from=0, page_to=299, callback=None):
+        """Parse a PDF by OCR-ing every page in the requested range.
+
+        Args:
+            pdf_source: PDF file path (``str`` or ``pathlib.Path``) or the raw
+                PDF bytes.
+            zoomin: Render scale factor, default 3.
+            page_from: Index of the first page to process (0-based).
+            page_to: End index of the page slice (exclusive).
+            callback: Optional progress callback ``callback(prog, msg)`` with
+                ``prog`` a float and ``msg`` a string.
+
+        Returns:
+            dict: ``{"pages": [{"page_number": int, "boxes": [...]}, ...]}``
+            where each box is ``{"text": str, "bbox": [[x0, y0], [x1, y0],
+            [x1, y1], [x0, y1]], "confidence": float}``. Page numbers are
+            1-based. ``{"pages": []}`` is returned when no page could be
+            rendered.
+        """
+        if callback:
+            callback(0.0, "Starting PDF parsing...")
+
+        # 1. Render the pages to images.
+        if callback:
+            callback(0.1, "Converting PDF to images...")
+        page_images = self.__convert_pdf_to_images(pdf_source, zoomin, page_from, page_to)
+
+        if not page_images:
+            logging.warning("No pages converted from PDF")
+            return {"pages": []}
+
+        # 2. OCR every page.
+        async def process_all_pages():
+            if self.parallel_limiter:
+                # Parallel path (multiple devices). nursery.start_soon()
+                # returns None, so each task stores its own result in a slot
+                # addressed by page index; the list is complete once the
+                # nursery block exits and keeps the original page order.
+                boxes_by_index = [None] * len(page_images)
+
+                async def run_page(index, img):
+                    device_id = index % PARALLEL_DEVICES
+                    boxes_by_index[index] = await self.__ocr_page_async(
+                        page_from + index + 1, img, zoomin, device_id,
+                        self.parallel_limiter[device_id], callback
+                    )
+
+                async with trio.open_nursery() as nursery:
+                    for index, img in enumerate(page_images):
+                        nursery.start_soon(run_page, index, img)
+
+                return [
+                    {"page_number": page_from + index + 1, "boxes": boxes}
+                    for index, boxes in enumerate(boxes_by_index)
+                ]
+
+            # Serial path (single device or CPU).
+            pages_result = []
+            for i, img in enumerate(page_images):
+                page_num = page_from + i + 1
+                result = await trio.to_thread.run_sync(
+                    lambda img=img, pn=page_num: self.__ocr_page(pn, img, zoomin, 0)
+                )
+                pages_result.append({
+                    "page_number": page_num,
+                    "boxes": result
+                })
+                if callback:
+                    callback(0.1 + (i + 1) * 0.9 / len(page_images), f"Processing page {page_num}...")
+            return pages_result
+
+        # Drive the async processing to completion.
+        if callback:
+            callback(0.2, "Starting OCR processing...")
+
+        start = timer()
+        pages_result = trio.run(process_all_pages)
+        logging.info(f"OCR processing completed in {timer() - start:.2f}s")
+
+        if callback:
+            callback(1.0, "OCR processing completed")
+
+        return {
+            "pages": pages_result
+        }
+
+
+# Backward-compatible alias.
+PdfParser = SimplePdfParser
+