diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 4853b91..5a8ea14 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -1,1434 +1,24 @@ -# -# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import logging -import os -import random -import re -import sys -import threading -from copy import deepcopy -from io import BytesIO -from timeit import default_timer as timer - -import numpy as np -import pdfplumber -import trio -import xgboost as xgb -from huggingface_hub import snapshot_download -from PIL import Image -from pypdf import PdfReader as pdf2_read - -from api import settings -from api.utils.file_utils import get_project_base_directory -from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer -from deepdoc.parser.ocr_http_client import OCRHttpClient -from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk -from rag.nlp import rag_tokenizer -from rag.prompts.generator import vision_llm_describe_prompt -from rag.settings import PARALLEL_DEVICES - -LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" -if LOCK_KEY_pdfplumber not in sys.modules: - sys.modules[LOCK_KEY_pdfplumber] = threading.Lock() - - -class RAGFlowPdfParser: - def __init__(self, **kwargs): - """ - If you have trouble downloading HuggingFace models, -_^ this might help!! 
- - For Linux: - export HF_ENDPOINT=https://hf-mirror.com - - For Windows: - Good luck - ^_- - - """ - - # 检查是否使用 HTTP OCR 服务 - use_http_ocr = os.getenv("USE_OCR_HTTP", "false").lower() in ("true", "1", "yes") - ocr_service_url = os.getenv("OCR_SERVICE_URL", "http://localhost:8000") - - if use_http_ocr: - logging.info(f"Using HTTP OCR service: {ocr_service_url}") - self.ocr = None # 不使用本地 OCR - self.ocr_http_client = OCRHttpClient(base_url=ocr_service_url) - self.use_http_ocr = True - else: - logging.info("Using local OCR") - self.ocr = OCR() - self.ocr_http_client = None - self.use_http_ocr = False - - self.parallel_limiter = None - if not self.use_http_ocr and PARALLEL_DEVICES > 1: - self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)] - - layout_recognizer_type = os.getenv("LAYOUT_RECOGNIZER_TYPE", "onnx").lower() - if layout_recognizer_type not in ["onnx", "ascend"]: - raise RuntimeError("Unsupported layout recognizer type.") - - if hasattr(self, "model_speciess"): - recognizer_domain = "layout." 
+ self.model_speciess - else: - recognizer_domain = "layout" - - if layout_recognizer_type == "ascend": - logging.debug("Using Ascend LayoutRecognizer") - self.layouter = AscendLayoutRecognizer(recognizer_domain) - else: # onnx - logging.debug("Using Onnx LayoutRecognizer") - self.layouter = LayoutRecognizer(recognizer_domain) - self.tbl_det = TableStructureRecognizer() - - self.updown_cnt_mdl = xgb.Booster() - if not settings.LIGHTEN: - try: - import torch.cuda - - if torch.cuda.is_available(): - self.updown_cnt_mdl.set_param({"device": "cuda"}) - except Exception: - logging.exception("RAGFlowPdfParser __init__") - try: - model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc") - self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model")) - except Exception: - model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0", local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), local_dir_use_symlinks=False) - self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model")) - - self.page_from = 0 - self.column_num = 1 - - def __char_width(self, c): - return (c["x1"] - c["x0"]) // max(len(c["text"]), 1) - - def __height(self, c): - return c["bottom"] - c["top"] - - def _x_dis(self, a, b): - return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]), abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2) - - def _y_dis(self, a, b): - return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2 - - def _match_proj(self, b): - proj_patt = [ - r"第[零一二三四五六七八九十百]+章", - r"第[零一二三四五六七八九十百]+[条节]", - r"[零一二三四五六七八九十百]+[、是  ]", - r"[\((][零一二三四五六七八九十百]+[)\)]", - r"[\((][0-9]+[)\)]", - r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})", - r"[0-9]+\.[0-9.]+(、|\.[  ])", - r"[⚫•➢①② ]", - ] - return any([re.match(p, b["text"]) for p in proj_patt]) - - def _updown_concat_features(self, up, down): - w = max(self.__char_width(up), self.__char_width(down)) - h = max(self.__height(up), self.__height(down)) - y_dis = 
self._y_dis(up, down) - LEN = 6 - tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split() - tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split() - tks_all = up["text"][-LEN:].strip() + (" " if re.match(r"[a-zA-Z0-9]+", up["text"][-1] + down["text"][0]) else "") + down["text"][:LEN].strip() - tks_all = rag_tokenizer.tokenize(tks_all).split() - fea = [ - up.get("R", -1) == down.get("R", -1), - y_dis / h, - down["page_number"] - up["page_number"], - up["layout_type"] == down["layout_type"], - up["layout_type"] == "text", - down["layout_type"] == "text", - up["layout_type"] == "table", - down["layout_type"] == "table", - True if re.search(r"([。?!;!?;+))]|[a-z]\.)$", up["text"]) else False, - True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False, - True if re.search(r"(^.?[/,?;:\],。;:’”?!》】)-])", down["text"]) else False, - True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False, - True if re.search(r"[,,][^。.]+$", up["text"]) else False, - True if re.search(r"[,,][^。.]+$", up["text"]) else False, - True if re.search(r"[\((][^\))]+$", up["text"]) and re.search(r"[\))]", down["text"]) else False, - self._match_proj(down), - True if re.match(r"[A-Z]", down["text"]) else False, - True if re.match(r"[A-Z]", up["text"][-1]) else False, - True if re.match(r"[a-z0-9]", up["text"][-1]) else False, - True if re.match(r"[0-9.%,-]+$", down["text"]) else False, - up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False, - up["x0"] > down["x1"], - abs(self.__height(up) - self.__height(down)) / min(self.__height(up), self.__height(down)), - self._x_dis(up, down) / max(w, 0.000001), - (len(up["text"]) - len(down["text"])) / max(len(up["text"]), len(down["text"])), - len(tks_all) - len(tks_up) - len(tks_down), - len(tks_down) - len(tks_up), - tks_down[-1] == tks_up[-1] if tks_down and tks_up else False, - max(down["in_row"], up["in_row"]), - abs(down["in_row"] - up["in_row"]), - 
len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0, - len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0, - ] - return fea - - @staticmethod - def sort_X_by_page(arr, threshold): - # sort using y1 first and then x1 - arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) - for i in range(len(arr) - 1): - for j in range(i, -1, -1): - # restore the order using th - if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold and arr[j + 1]["top"] < arr[j]["top"] and arr[j + 1]["page_number"] == arr[j]["page_number"]: - tmp = arr[j] - arr[j] = arr[j + 1] - arr[j + 1] = tmp - return arr - - def _has_color(self, o): - if o.get("ncs", "") == "DeviceGray": - if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and o["non_stroking_color"][0] == 1: - if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")): - return False - return True - - def _table_transformer_job(self, ZM): - logging.debug("Table processing...") - imgs, pos = [], [] - tbcnt = [0] - MARGIN = 10 - self.tb_cpns = [] - assert len(self.page_layout) == len(self.page_images) - for p, tbls in enumerate(self.page_layout): # for page - tbls = [f for f in tbls if f["type"] == "table"] - tbcnt.append(len(tbls)) - if not tbls: - continue - for tb in tbls: # for table - left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, tb["x1"] + MARGIN, tb["bottom"] + MARGIN - left *= ZM - top *= ZM - right *= ZM - bott *= ZM - pos.append((left, top)) - imgs.append(self.page_images[p].crop((left, top, right, bott))) - - assert len(self.page_images) == len(tbcnt) - 1 - if not imgs: - return - recos = self.tbl_det(imgs) - tbcnt = np.cumsum(tbcnt) - for i in range(len(tbcnt) - 1): # for page - pg = [] - for j, tb_items in enumerate(recos[tbcnt[i] : tbcnt[i + 1]]): # for table - poss = pos[tbcnt[i] : tbcnt[i + 1]] - for it in tb_items: # for table components - it["x0"] = it["x0"] + poss[j][0] - it["x1"] = it["x1"] + poss[j][0] - it["top"] = it["top"] + 
poss[j][1] - it["bottom"] = it["bottom"] + poss[j][1] - for n in ["x0", "x1", "top", "bottom"]: - it[n] /= ZM - it["top"] += self.page_cum_height[i] - it["bottom"] += self.page_cum_height[i] - it["pn"] = i - it["layoutno"] = j - pg.append(it) - self.tb_cpns.extend(pg) - - def gather(kwd, fzy=10, ption=0.6): - eles = Recognizer.sort_Y_firstly([r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy) - eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption) - return Recognizer.sort_Y_firstly(eles, 0) - - # add R,H,C,SP tag to boxes within table layout - headers = gather(r".*header$") - rows = gather(r".* (row|header)") - spans = gather(r".*spanning") - clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"])) - clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5) - for b in self.boxes: - if b.get("layout_type", "") != "table": - continue - ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3) - if ii is not None: - b["R"] = ii - b["R_top"] = rows[ii]["top"] - b["R_bott"] = rows[ii]["bottom"] - - ii = Recognizer.find_overlapped_with_threshold(b, headers, thr=0.3) - if ii is not None: - b["H_top"] = headers[ii]["top"] - b["H_bott"] = headers[ii]["bottom"] - b["H_left"] = headers[ii]["x0"] - b["H_right"] = headers[ii]["x1"] - b["H"] = ii - - ii = Recognizer.find_horizontally_tightest_fit(b, clmns) - if ii is not None: - b["C"] = ii - b["C_left"] = clmns[ii]["x0"] - b["C_right"] = clmns[ii]["x1"] - - ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3) - if ii is not None: - b["H_top"] = spans[ii]["top"] - b["H_bott"] = spans[ii]["bottom"] - b["H_left"] = spans[ii]["x0"] - b["H_right"] = spans[ii]["x1"] - b["SP"] = ii - - def _convert_http_ocr_result(self, ocr_result: dict, zoomin: int = 3): - """ - 将 HTTP OCR API 返回的结果转换为 RAGFlow 内部格式 - - Args: - ocr_result: HTTP API 返回的结果,格式: - { - "success": bool, - "data": { - "pages": [ - { - "page_number": int, - 
"boxes": [ - { - "text": str, - "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], - "confidence": float - } - ] - } - ] - } - } - zoomin: 放大倍数 - """ - if not ocr_result.get("success", False) or "data" not in ocr_result: - logging.warning("Invalid OCR HTTP result") - return - - pages_data = ocr_result["data"].get("pages", []) - self.boxes = [] - - for page_data in pages_data: - page_num = page_data.get("page_number", 0) # HTTP API 返回的页码(从1开始) - boxes = page_data.get("boxes", []) - - # 转换为 RAGFlow 格式的 boxes - ragflow_boxes = [] - # 计算在 page_chars 中的索引:HTTP API 返回的页码是从1开始的,需要转换为相对于 page_from 的索引 - page_index = page_num - (self.page_from + 1) # page_from 是从0开始,所以需要 +1 - chars_for_page = self.page_chars[page_index] if hasattr(self, 'page_chars') and 0 <= page_index < len(self.page_chars) else [] - - for box in boxes: - bbox = box.get("bbox", []) - if len(bbox) != 4: - continue - - # 从 bbox 提取坐标(bbox 格式: [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]) - x0 = min(bbox[0][0], bbox[3][0]) / zoomin - x1 = max(bbox[1][0], bbox[2][0]) / zoomin - top = min(bbox[0][1], bbox[1][1]) / zoomin - bottom = max(bbox[2][1], bbox[3][1]) / zoomin - - # 创建 RAGFlow 格式的 box - ragflow_box = { - "x0": x0, - "x1": x1, - "top": top, - "bottom": bottom, - "text": box.get("text", ""), - "page_number": page_num, - "layoutno": "", - "layout_type": "" - } - - ragflow_boxes.append(ragflow_box) - - # 计算 mean_height - if ragflow_boxes: - heights = [b["bottom"] - b["top"] for b in ragflow_boxes] - self.mean_height.append(np.median(heights) if heights else 0) - else: - self.mean_height.append(0) - - # 计算 mean_width - if chars_for_page: - widths = [c.get("width", 8) for c in chars_for_page] - self.mean_width.append(np.median(widths) if widths else 8) - else: - self.mean_width.append(8) - - self.boxes.append(ragflow_boxes) - - logging.info(f"Converted {len(pages_data)} pages from HTTP OCR result") - - def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None): - # 如果使用 HTTP OCR,这个方法不会被调用 - if 
self.use_http_ocr: - logging.warning("__ocr called when using HTTP OCR, this should not happen") - return - - start = timer() - bxs = self.ocr.detect(np.array(img), device_id) - logging.info(f"__ocr detecting boxes of a image cost ({timer() - start}s)") - - start = timer() - if not bxs: - self.boxes.append([]) - return - bxs = [(line[0], line[1][0]) for line in bxs] - bxs = Recognizer.sort_Y_firstly( - [ - {"x0": b[0][0] / ZM, "x1": b[1][0] / ZM, "top": b[0][1] / ZM, "text": "", "txt": t, "bottom": b[-1][1] / ZM, "chars": [], "page_number": pagenum} - for b, t in bxs - if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1] - ], - self.mean_height[pagenum - 1] / 3, - ) - - # merge chars in the same rect - for c in chars: - ii = Recognizer.find_overlapped(c, bxs) - if ii is None: - self.lefted_chars.append(c) - continue - ch = c["bottom"] - c["top"] - bh = bxs[ii]["bottom"] - bxs[ii]["top"] - if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != " ": - self.lefted_chars.append(c) - continue - bxs[ii]["chars"].append(c) - - for b in bxs: - if not b["chars"]: - del b["chars"] - continue - m_ht = np.mean([c["height"] for c in b["chars"]]) - for c in Recognizer.sort_Y_firstly(b["chars"], m_ht): - if c["text"] == " " and b["text"]: - if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]): - b["text"] += " " - else: - b["text"] += c["text"] - del b["chars"] - - logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s") - start = timer() - boxes_to_reg = [] - img_np = np.array(img) - for b in bxs: - if not b["text"]: - left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM - b["box_image"] = self.ocr.get_rotate_crop_image(img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)) - boxes_to_reg.append(b) - del b["txt"] - texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg], device_id) - for i in range(len(boxes_to_reg)): - boxes_to_reg[i]["text"] = texts[i] - del 
boxes_to_reg[i]["box_image"] - logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s") - bxs = [b for b in bxs if b["text"]] - if self.mean_height[pagenum - 1] == 0: - self.mean_height[pagenum - 1] = np.median([b["bottom"] - b["top"] for b in bxs]) - self.boxes.append(bxs) - - def _layouts_rec(self, ZM, drop=True): - assert len(self.page_images) == len(self.boxes) - self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop) - # cumlative Y - for i in range(len(self.boxes)): - self.boxes[i]["top"] += self.page_cum_height[self.boxes[i]["page_number"] - 1] - self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1] - - def _text_merge(self): - # merge adjusted boxes - bxs = self.boxes - - def end_with(b, txt): - txt = txt.strip() - tt = b.get("text", "").strip() - return tt and tt.find(txt) == len(tt) - len(txt) - - def start_with(b, txts): - tt = b.get("text", "").strip() - return tt and any([tt.find(t.strip()) == 0 for t in txts]) - - # horizontally merge adjacent box with the same layout - i = 0 - while i < len(bxs) - 1: - b = bxs[i] - b_ = bxs[i + 1] - if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: - i += 1 - continue - if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: - # merge - bxs[i]["x1"] = b_["x1"] - bxs[i]["top"] = (b["top"] + b_["top"]) / 2 - bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2 - bxs[i]["text"] += b_["text"] - bxs.pop(i + 1) - continue - i += 1 - continue - - dis_thr = 1 - dis = b["x1"] - b_["x0"] - if b.get("layout_type", "") != "text" or b_.get("layout_type", "") != "text": - if end_with(b, ",") or start_with(b_, "(,"): - dis_thr = -8 - else: - i += 1 - continue - - if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 and dis >= dis_thr and b["x1"] < b_["x1"]: - # merge - bxs[i]["x1"] = b_["x1"] - bxs[i]["top"] = (b["top"] + b_["top"]) / 2 - 
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2 - bxs[i]["text"] += b_["text"] - bxs.pop(i + 1) - continue - i += 1 - self.boxes = bxs - - def _naive_vertical_merge(self, zoomin=3): - import math - bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3) - - column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) - if not column_width or math.isnan(column_width): - column_width = self.mean_width[0] - self.column_num = int(self.page_images[0].size[0] / zoomin / column_width) - if column_width < self.page_images[0].size[0] / zoomin / self.column_num: - logging.info("Multi-column................... {} {}".format(column_width, self.page_images[0].size[0] / zoomin / self.column_num)) - self.boxes = self.sort_X_by_page(self.boxes, column_width / self.column_num) - - i = 0 - while i + 1 < len(bxs): - b = bxs[i] - b_ = bxs[i + 1] - if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]): - bxs.pop(i) - continue - if not b["text"].strip(): - bxs.pop(i) - continue - concatting_feats = [ - b["text"].strip()[-1] in ",;:'\",、‘“;:-", - len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:", - b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:", - ] - # features for not concating - feats = [ - b.get("layoutno", 0) != b_.get("layoutno", 0), - b["text"].strip()[-1] in "。?!?", - self.is_english and b["text"].strip()[-1] in ".!?", - b["page_number"] == b_["page_number"] and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5, - b["page_number"] < b_["page_number"] and abs(b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4, - ] - # split features - detach_feats = [b["x1"] < b_["x0"], b["x0"] > b_["x1"]] - if (any(feats) and not any(concatting_feats)) or any(detach_feats): - logging.debug( - "{} {} {} {}".format( - b["text"], - b_["text"], - any(feats), - any(concatting_feats), - ) - ) - i += 1 - continue - # merge up and down - b["bottom"] = b_["bottom"] - b["text"] += 
b_["text"] - b["x0"] = min(b["x0"], b_["x0"]) - b["x1"] = max(b["x1"], b_["x1"]) - bxs.pop(i + 1) - self.boxes = bxs - - def _concat_downward(self, concat_between_pages=True): - self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0) - return - - # count boxes in the same row as a feature - for i in range(len(self.boxes)): - mh = self.mean_height[self.boxes[i]["page_number"] - 1] - self.boxes[i]["in_row"] = 0 - j = max(0, i - 12) - while j < min(i + 12, len(self.boxes)): - if j == i: - j += 1 - continue - ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh - if abs(ydis) < 1: - self.boxes[i]["in_row"] += 1 - elif ydis > 0: - break - j += 1 - - # concat between rows - boxes = deepcopy(self.boxes) - blocks = [] - while boxes: - chunks = [] - - def dfs(up, dp): - chunks.append(up) - i = dp - while i < min(dp + 12, len(boxes)): - ydis = self._y_dis(up, boxes[i]) - smpg = up["page_number"] == boxes[i]["page_number"] - mh = self.mean_height[up["page_number"] - 1] - mw = self.mean_width[up["page_number"] - 1] - if smpg and ydis > mh * 4: - break - if not smpg and ydis > mh * 16: - break - down = boxes[i] - if not concat_between_pages and down["page_number"] > up["page_number"]: - break - - if up.get("R", "") != down.get("R", "") and up["text"][-1] != ",": - i += 1 - continue - - if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) or not down["text"].strip(): - i += 1 - continue - - if not down["text"].strip() or not up["text"].strip(): - i += 1 - continue - - if up["x1"] < down["x0"] - 10 * mw or up["x0"] > down["x1"] + 10 * mw: - i += 1 - continue - - if i - dp < 5 and up.get("layout_type") == "text": - if up.get("layoutno", "1") == down.get("layoutno", "2"): - dfs(down, i + 1) - boxes.pop(i) - return - i += 1 - continue - - fea = self._updown_concat_features(up, down) - if self.updown_cnt_mdl.predict(xgb.DMatrix([fea]))[0] <= 0.5: - i += 1 - continue - dfs(down, i + 1) - boxes.pop(i) - return - - dfs(boxes[0], 1) - 
boxes.pop(0) - if chunks: - blocks.append(chunks) - - # concat within each block - boxes = [] - for b in blocks: - if len(b) == 1: - boxes.append(b[0]) - continue - t = b[0] - for c in b[1:]: - t["text"] = t["text"].strip() - c["text"] = c["text"].strip() - if not c["text"]: - continue - if t["text"] and re.match(r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]): - t["text"] += " " - t["text"] += c["text"] - t["x0"] = min(t["x0"], c["x0"]) - t["x1"] = max(t["x1"], c["x1"]) - t["page_number"] = min(t["page_number"], c["page_number"]) - t["bottom"] = c["bottom"] - if not t["layout_type"] and c["layout_type"]: - t["layout_type"] = c["layout_type"] - boxes.append(t) - - self.boxes = Recognizer.sort_Y_firstly(boxes, 0) - - def _filter_forpages(self): - if not self.boxes: - return - findit = False - i = 0 - while i < len(self.boxes): - if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): - i += 1 - continue - findit = True - eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip()) - self.boxes.pop(i) - if i >= len(self.boxes): - break - prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split()[:2]) - while not prefix: - self.boxes.pop(i) - if i >= len(self.boxes): - break - prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split()[:2]) - self.boxes.pop(i) - if i >= len(self.boxes) or not prefix: - break - for j in range(i, min(i + 128, len(self.boxes))): - if not re.match(prefix, self.boxes[j]["text"]): - continue - for k in range(i, j): - self.boxes.pop(i) - break - if findit: - return - - page_dirty = [0] * len(self.page_images) - for b in self.boxes: - if re.search(r"(··|··|··)", b["text"]): - page_dirty[b["page_number"] - 1] += 1 - page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3]) - if not page_dirty: - return - i = 0 - while i < len(self.boxes): - if 
self.boxes[i]["page_number"] in page_dirty: - self.boxes.pop(i) - continue - i += 1 - - def _merge_with_same_bullet(self): - i = 0 - while i + 1 < len(self.boxes): - b = self.boxes[i] - b_ = self.boxes[i + 1] - if not b["text"].strip(): - self.boxes.pop(i) - continue - if not b_["text"].strip(): - self.boxes.pop(i + 1) - continue - - if ( - b["text"].strip()[0] != b_["text"].strip()[0] - or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") - or rag_tokenizer.is_chinese(b["text"].strip()[0]) - or b["top"] > b_["bottom"] - ): - i += 1 - continue - b_["text"] = b["text"] + "\n" + b_["text"] - b_["x0"] = min(b["x0"], b_["x0"]) - b_["x1"] = max(b["x1"], b_["x1"]) - b_["top"] = b["top"] - self.boxes.pop(i) - - def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False): - tables = {} - figures = {} - # extract figure and table boxes - i = 0 - lst_lout_no = "" - nomerge_lout_no = [] - while i < len(self.boxes): - if "layoutno" not in self.boxes[i]: - i += 1 - continue - lout_no = str(self.boxes[i]["page_number"]) + "-" + str(self.boxes[i]["layoutno"]) - if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", "figure caption", "reference"]: - nomerge_lout_no.append(lst_lout_no) - if self.boxes[i]["layout_type"] == "table": - if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): - self.boxes.pop(i) - continue - if lout_no not in tables: - tables[lout_no] = [] - tables[lout_no].append(self.boxes[i]) - self.boxes.pop(i) - lst_lout_no = lout_no - continue - if need_image and self.boxes[i]["layout_type"] == "figure": - if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): - self.boxes.pop(i) - continue - if lout_no not in figures: - figures[lout_no] = [] - figures[lout_no].append(self.boxes[i]) - self.boxes.pop(i) - lst_lout_no = lout_no - continue - i += 1 - - # merge table on different pages - nomerge_lout_no = set(nomerge_lout_no) - tbls = 
sorted([(k, bxs) for k, bxs in tables.items()], key=lambda x: (x[1][0]["top"], x[1][0]["x0"])) - - i = len(tbls) - 1 - while i - 1 >= 0: - k0, bxs0 = tbls[i - 1] - k, bxs = tbls[i] - i -= 1 - if k0 in nomerge_lout_no: - continue - if bxs[0]["page_number"] == bxs0[0]["page_number"]: - continue - if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1: - continue - mh = self.mean_height[bxs[0]["page_number"] - 1] - if self._y_dis(bxs0[-1], bxs[0]) > mh * 23: - continue - tables[k0].extend(tables[k]) - del tables[k] - - def x_overlapped(a, b): - return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]]) - - # find captions and pop out - i = 0 - while i < len(self.boxes): - c = self.boxes[i] - # mh = self.mean_height[c["page_number"]-1] - if not TableStructureRecognizer.is_caption(c): - i += 1 - continue - - # find the nearest layouts - def nearest(tbls): - nonlocal c - mink = "" - minv = 1000000000 - for k, bxs in tbls.items(): - for b in bxs: - if b.get("layout_type", "").find("caption") >= 0: - continue - y_dis = self._y_dis(c, b) - x_dis = self._x_dis(c, b) if not x_overlapped(c, b) else 0 - dis = y_dis * y_dis + x_dis * x_dis - if dis < minv: - mink = k - minv = dis - return mink, minv - - tk, tv = nearest(tables) - fk, fv = nearest(figures) - # if min(tv, fv) > 2000: - # i += 1 - # continue - if tv < fv and tk: - tables[tk].insert(0, c) - logging.debug("TABLE:" + self.boxes[i]["text"] + "; Cap: " + tk) - elif fk: - figures[fk].insert(0, c) - logging.debug("FIGURE:" + self.boxes[i]["text"] + "; Cap: " + tk) - self.boxes.pop(i) - - def cropout(bxs, ltype, poss): - nonlocal ZM - pn = set([b["page_number"] - 1 for b in bxs]) - if len(pn) < 2: - pn = list(pn)[0] - ht = self.page_cum_height[pn] - b = {"x0": np.min([b["x0"] for b in bxs]), "top": np.min([b["top"] for b in bxs]) - ht, "x1": np.max([b["x1"] for b in bxs]), "bottom": np.max([b["bottom"] for b in bxs]) - ht} - louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype] - ii = 
Recognizer.find_overlapped(b, louts, naive=True) - if ii is not None: - b = louts[ii] - else: - logging.warning(f"Missing layout match: {pn + 1},%s" % (bxs[0].get("layoutno", ""))) - - left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"] - if right < left: - right = left + 1 - poss.append((pn + self.page_from, left, right, top, bott)) - return self.page_images[pn].crop((left * ZM, top * ZM, right * ZM, bott * ZM)) - pn = {} - for b in bxs: - p = b["page_number"] - 1 - if p not in pn: - pn[p] = [] - pn[p].append(b) - pn = sorted(pn.items(), key=lambda x: x[0]) - imgs = [cropout(arr, ltype, poss) for p, arr in pn] - pic = Image.new("RGB", (int(np.max([i.size[0] for i in imgs])), int(np.sum([m.size[1] for m in imgs]))), (245, 245, 245)) - height = 0 - for img in imgs: - pic.paste(img, (0, int(height))) - height += img.size[1] - return pic - - res = [] - positions = [] - figure_results = [] - figure_positions = [] - # crop figure out and add caption - for k, bxs in figures.items(): - txt = "\n".join([b["text"] for b in bxs]) - if not txt: - continue - - poss = [] - - if separate_tables_figures: - figure_results.append((cropout(bxs, "figure", poss), [txt])) - figure_positions.append(poss) - else: - res.append((cropout(bxs, "figure", poss), [txt])) - positions.append(poss) - - for k, bxs in tables.items(): - if not bxs: - continue - bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs])) - - poss = [] - - res.append((cropout(bxs, "table", poss), self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english))) - positions.append(poss) - - if separate_tables_figures: - assert len(positions) + len(figure_positions) == len(res) + len(figure_results) - if need_position: - return list(zip(res, positions)), list(zip(figure_results, figure_positions)) - else: - return res, figure_results - else: - assert len(positions) == len(res) - if need_position: - return list(zip(res, positions)) - else: - return res - - def 
proj_match(self, line): - if len(line) <= 2: - return - if re.match(r"[0-9 ().,%%+/-]+$", line): - return False - for p, j in [ - (r"第[零一二三四五六七八九十百]+章", 1), - (r"第[零一二三四五六七八九十百]+[条节]", 2), - (r"[零一二三四五六七八九十百]+[、  ]", 3), - (r"[\((][零一二三四五六七八九十百]+[)\)]", 4), - (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5), - (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6), - (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7), - (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8), - (r".{,48}[::??]$", 9), - (r"[0-9]+)", 10), - (r"[\((][0-9]+[)\)]", 11), - (r"[零一二三四五六七八九十百]+是", 12), - (r"[⚫•➢✓]", 12), - ]: - if re.match(p, line): - return j - return - - def _line_tag(self, bx, ZM): - pn = [bx["page_number"]] - top = bx["top"] - self.page_cum_height[pn[0] - 1] - bott = bx["bottom"] - self.page_cum_height[pn[0] - 1] - page_images_cnt = len(self.page_images) - if pn[-1] - 1 >= page_images_cnt: - return "" - while bott * ZM > self.page_images[pn[-1] - 1].size[1]: - bott -= self.page_images[pn[-1] - 1].size[1] / ZM - pn.append(pn[-1] + 1) - if pn[-1] - 1 >= page_images_cnt: - return "" - - return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), bx["x0"], bx["x1"], top, bott) - - def __filterout_scraps(self, boxes, ZM): - def width(b): - return b["x1"] - b["x0"] - - def height(b): - return b["bottom"] - b["top"] - - def usefull(b): - if b.get("layout_type"): - return True - if width(b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3: - return True - if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]: - return True - return False - - res = [] - while boxes: - lines = [] - widths = [] - pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM - mh = self.mean_height[boxes[0]["page_number"] - 1] - mj = self.proj_match(boxes[0]["text"]) or boxes[0].get("layout_type", "") == "title" - - def dfs(line, st): - nonlocal mh, pw, lines, widths - lines.append(line) - widths.append(width(line)) - mmj = self.proj_match(line["text"]) or line.get("layout_type", "") 
== "title" - for i in range(st + 1, min(st + 20, len(boxes))): - if (boxes[i]["page_number"] - line["page_number"]) > 0: - break - if not mmj and self._y_dis(line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh: - break - - if not usefull(boxes[i]): - continue - if mmj or (self._x_dis(boxes[i], line) < pw / 10): - # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5): - # concat following - dfs(boxes[i], i) - boxes.pop(i) - break - - try: - if usefull(boxes[0]): - dfs(boxes[0], 0) - else: - logging.debug("WASTE: " + boxes[0]["text"]) - except Exception: - pass - boxes.pop(0) - mw = np.mean(widths) - if mj or mw / pw >= 0.35 or mw > 200: - res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines])) - else: - logging.debug("REMOVED: " + "<<".join([c["text"] for c in lines])) - - return "\n\n".join(res) - - @staticmethod - def total_page_number(fnm, binary=None): - try: - with sys.modules[LOCK_KEY_pdfplumber]: - pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary)) - total_page = len(pdf.pages) - pdf.close() - return total_page - except Exception: - logging.exception("total_page_number") - - def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): - self.lefted_chars = [] - self.mean_height = [] - self.mean_width = [] - self.boxes = [] - self.garbages = {} - self.page_cum_height = [0] - self.page_layout = [] - self.page_from = page_from - - start = timer() - try: - with sys.modules[LOCK_KEY_pdfplumber]: - with pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) as pdf: - self.pdf = pdf - self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])] - - try: - self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] - except Exception as e: - logging.warning(f"Failed to extract characters for pages 
{page_from}-{page_to}: {str(e)}") - self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead. - - self.total_page = len(self.pdf.pages) - - except Exception: - logging.exception("RAGFlowPdfParser __images__") - logging.info(f"__images__ dedupe_chars cost {timer() - start}s") - - # 如果使用 HTTP OCR,在获取图片和字符信息后调用 HTTP API 获取 OCR 结果 - if self.use_http_ocr: - try: - if callback: - callback(0.1, "Calling OCR HTTP service...") - - # 调用 HTTP OCR 服务 - if isinstance(fnm, str): - # 文件路径 - ocr_result = self.ocr_http_client.parse_pdf_by_path( - fnm, - page_from=page_from + 1, # HTTP API 使用从1开始的页码 - page_to=(page_to + 1) if page_to < 299 else 0, # 转换为从1开始,0 表示最后一页 - zoomin=zoomin - ) - else: - # 二进制数据 - ocr_result = self.ocr_http_client.parse_pdf_by_bytes( - fnm, - filename="document.pdf", - page_from=page_from + 1, - page_to=(page_to + 1) if page_to < 299 else 0, - zoomin=zoomin - ) - - # 将 HTTP API 返回的结果转换为 RAGFlow 格式 - self._convert_http_ocr_result(ocr_result, zoomin) - - if callback: - callback(0.4, "OCR HTTP service completed") - - except Exception as e: - logging.error(f"Failed to call OCR HTTP service: {e}", exc_info=True) - # 如果 HTTP OCR 失败,回退到空结果或抛出异常 - raise - - self.outlines = [] - try: - with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf: - self.pdf = pdf - - outlines = self.pdf.outline - - def dfs(arr, depth): - for a in arr: - if isinstance(a, dict): - self.outlines.append((a["/Title"], depth)) - continue - dfs(a, depth + 1) - - dfs(outlines, 0) - - except Exception as e: - logging.warning(f"Outlines exception: {e}") - - if not self.outlines: - logging.warning("Miss outlines") - - logging.debug("Images converted.") - self.is_english = [ - re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) - for i in range(len(self.page_chars)) - ] - if sum([1 if e else 0 for e in self.is_english]) > 
len(self.page_images) / 2: - self.is_english = True - else: - self.is_english = False - - async def __img_ocr(i, id, img, chars, limiter): - j = 0 - while j + 1 < len(chars): - if ( - chars[j]["text"] - and chars[j + 1]["text"] - and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) - and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"], chars[j]["width"]) / 2 - ): - chars[j]["text"] += " " - j += 1 - - if limiter: - async with limiter: - await trio.to_thread.run_sync(lambda: self.__ocr(i + 1, img, chars, zoomin, id)) - else: - self.__ocr(i + 1, img, chars, zoomin, id) - - if callback and i % 6 == 5: - callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") - - # 如果使用 HTTP OCR,已经在上面的代码中获取了结果,跳过本地 OCR - if not self.use_http_ocr: - async def __img_ocr_launcher(): - def __ocr_preprocess(): - chars = self.page_chars[i] if not self.is_english else [] - self.mean_height.append(np.median(sorted([c["height"] for c in chars])) if chars else 0) - self.mean_width.append(np.median(sorted([c["width"] for c in chars])) if chars else 8) - self.page_cum_height.append(img.size[1] / zoomin) - return chars - - if self.parallel_limiter: - async with trio.open_nursery() as nursery: - for i, img in enumerate(self.page_images): - chars = __ocr_preprocess() - - nursery.start_soon(__img_ocr, i, i % PARALLEL_DEVICES, img, chars, self.parallel_limiter[i % PARALLEL_DEVICES]) - await trio.sleep(0.1) - else: - for i, img in enumerate(self.page_images): - chars = __ocr_preprocess() - await __img_ocr(i, 0, img, chars, None) - - start = timer() - trio.run(__img_ocr_launcher) - else: - # HTTP OCR 模式:初始化 page_cum_height - for i, img in enumerate(self.page_images): - self.page_cum_height.append(img.size[1] / zoomin) - - logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s") - - if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: - bxes = [b for bxs in self.boxes for b in bxs] - self.is_english = 
re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) - - logging.debug("Is it English:", self.is_english) - - self.page_cum_height = np.cumsum(self.page_cum_height) - assert len(self.page_cum_height) == len(self.page_images) + 1 - if len(self.boxes) == 0 and zoomin < 9: - self.__images__(fnm, zoomin * 3, page_from, page_to, callback) - - def __call__(self, fnm, need_image=True, zoomin=3, return_html=False): - self.__images__(fnm, zoomin) - self._layouts_rec(zoomin) - self._table_transformer_job(zoomin) - self._text_merge() - self._concat_downward() - self._filter_forpages() - tbls = self._extract_table_figure(need_image, zoomin, return_html, False) - return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls - - def parse_into_bboxes(self, fnm, callback=None, zoomin=3): - start = timer() - self.__images__(fnm, zoomin) - if callback: - callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start)) - - start = timer() - self._layouts_rec(zoomin) - if callback: - callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) - - start = timer() - self._table_transformer_job(zoomin) - if callback: - callback(0.83, "Table analysis ({:.2f}s)".format(timer() - start)) - - start = timer() - self._text_merge() - self._concat_downward() - self._naive_vertical_merge(zoomin) - if callback: - callback(0.92, "Text merged ({:.2f}s)".format(timer() - start)) - - start = timer() - tbls, figs = self._extract_table_figure(True, zoomin, True, True, True) - - def insert_table_figures(tbls_or_figs, layout_type): - def min_rectangle_distance(rect1, rect2): - import math - pn1, left1, right1, top1, bottom1 = rect1 - pn2, left2, right2, top2, bottom2 = rect2 - if right1 >= left2 and right2 >= left1 and bottom1 >= top2 and bottom2 >= top1: - return 0 - if right1 < left2: - dx = left2 - right1 - elif right2 < left1: - dx = left1 - right2 - else: - dx = 0 - if bottom1 < top2: - dy = top2 - 
bottom1 - elif bottom2 < top1: - dy = top1 - bottom2 - else: - dy = 0 - return math.sqrt(dx*dx + dy*dy)# + (pn2-pn1)*10000 - - for (img, txt), poss in tbls_or_figs: - bboxes = [(i, (b["page_number"], b["x0"], b["x1"], b["top"], b["bottom"])) for i, b in enumerate(self.boxes)] - dists = [(min_rectangle_distance((pn, left, right, top+self.page_cum_height[pn], bott+self.page_cum_height[pn]), rect),i) for i, rect in bboxes for pn, left, right, top, bott in poss] - min_i = np.argmin(dists, axis=0)[0] - min_i, rect = bboxes[dists[min_i][-1]] - if isinstance(txt, list): - txt = "\n".join(txt) - pn, left, right, top, bott = poss[0] - if self.boxes[min_i]["bottom"] < top+self.page_cum_height[pn]: - min_i += 1 - self.boxes.insert(min_i, { - "page_number": pn+1, "x0": left, "x1": right, "top": top+self.page_cum_height[pn], "bottom": bott+self.page_cum_height[pn], "layout_type": layout_type, "text": txt, "image": img, - "positions": [[pn+1, int(left), int(right), int(top), int(bott)]] - }) - - for b in self.boxes: - b["position_tag"] = self._line_tag(b, zoomin) - b["image"] = self.crop(b["position_tag"], zoomin) - b["positions"] = [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(b["position_tag"])] - - insert_table_figures(tbls, "table") - insert_table_figures(figs, "figure") - if callback: - callback(1, "Structured ({:.2f}s)".format(timer() - start)) - return deepcopy(self.boxes) - - @staticmethod - def remove_tag(txt): - return re.sub(r"@@[\t0-9.-]+?##", "", txt) - - @staticmethod - def extract_positions(txt): - poss = [] - for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt): - pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t") - left, right, top, bottom = float(left), float(right), float(top), float(bottom) - poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) - return poss - - def crop(self, text, ZM=3, need_position=False): - imgs = [] - poss = self.extract_positions(text) - if not poss: - if 
need_position: - return None, None - return - - max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6) - GAP = 6 - pos = poss[0] - poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0))) - pos = poss[-1] - poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120))) - - positions = [] - for ii, (pns, left, right, top, bottom) in enumerate(poss): - right = left + max_width - bottom *= ZM - for pn in pns[1:]: - bottom += self.page_images[pn - 1].size[1] - imgs.append(self.page_images[pns[0]].crop((left * ZM, top * ZM, right * ZM, min(bottom, self.page_images[pns[0]].size[1])))) - if 0 < ii < len(poss) - 1: - positions.append((pns[0] + self.page_from, left, right, top, min(bottom, self.page_images[pns[0]].size[1]) / ZM)) - bottom -= self.page_images[pns[0]].size[1] - for pn in pns[1:]: - imgs.append(self.page_images[pn].crop((left * ZM, 0, right * ZM, min(bottom, self.page_images[pn].size[1])))) - if 0 < ii < len(poss) - 1: - positions.append((pn + self.page_from, left, right, 0, min(bottom, self.page_images[pn].size[1]) / ZM)) - bottom -= self.page_images[pn].size[1] - - if not imgs: - if need_position: - return None, None - return - height = 0 - for img in imgs: - height += img.size[1] + GAP - height = int(height) - width = int(np.max([i.size[0] for i in imgs])) - pic = Image.new("RGB", (width, height), (245, 245, 245)) - height = 0 - for ii, img in enumerate(imgs): - if ii == 0 or ii + 1 == len(imgs): - img = img.convert("RGBA") - overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) - overlay.putalpha(128) - img = Image.alpha_composite(img, overlay).convert("RGB") - pic.paste(img, (0, int(height))) - height += img.size[1] + GAP - - if need_position: - return pic, positions - return pic - - def get_position(self, bx, ZM): - poss = [] - pn = bx["page_number"] - top = bx["top"] - self.page_cum_height[pn - 1] 
- bott = bx["bottom"] - self.page_cum_height[pn - 1] - poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM))) - while bott * ZM > self.page_images[pn - 1].size[1]: - bott -= self.page_images[pn - 1].size[1] / ZM - top = 0 - pn += 1 - poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM))) - return poss - - -class PlainParser: - def __call__(self, filename, from_page=0, to_page=100000, **kwargs): - self.outlines = [] - lines = [] - try: - self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename)) - for page in self.pdf.pages[from_page:to_page]: - lines.extend([t for t in page.extract_text().split("\n")]) - - outlines = self.pdf.outline - - def dfs(arr, depth): - for a in arr: - if isinstance(a, dict): - self.outlines.append((a["/Title"], depth)) - continue - dfs(a, depth + 1) - - dfs(outlines, 0) - except Exception: - logging.exception("Outlines exception") - if not self.outlines: - logging.warning("Miss outlines") - - return [(line, "") for line in lines], [] - - def crop(self, ck, need_position): - raise NotImplementedError - - @staticmethod - def remove_tag(txt): - raise NotImplementedError - - -class VisionParser(RAGFlowPdfParser): - def __init__(self, vision_model, *args, **kwargs): - super().__init__(*args, **kwargs) - self.vision_model = vision_model - - def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): - try: - with sys.modules[LOCK_KEY_pdfplumber]: - self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) - self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[page_from:page_to])] - self.total_page = len(self.pdf.pages) - except Exception: - self.page_images = None - self.total_page = 0 - logging.exception("VisionParser __images__") - - def __call__(self, filename, from_page=0, to_page=100000, **kwargs): - callback = kwargs.get("callback", lambda 
prog, msg: None) - zoomin = kwargs.get("zoomin", 3) - self.__images__(fnm=filename, zoomin=zoomin, page_from=from_page, page_to=to_page, callback=callback) - - total_pdf_pages = self.total_page - - start_page = max(0, from_page) - end_page = min(to_page, total_pdf_pages) - - all_docs = [] - - for idx, img_binary in enumerate(self.page_images or []): - pdf_page_num = idx # 0-based - if pdf_page_num < start_page or pdf_page_num >= end_page: - continue - - text = picture_vision_llm_chunk( - binary=img_binary, - vision_model=self.vision_model, - prompt=vision_llm_describe_prompt(page=pdf_page_num + 1), - callback=callback, - ) - if kwargs.get("callback"): - kwargs["callback"](idx * 1.0 / len(self.page_images), f"Processed: {idx + 1}/{len(self.page_images)}") - - if text: - width, height = self.page_images[idx].size - all_docs.append((text, f"{pdf_page_num + 1} 0 {width / zoomin} 0 {height / zoomin}")) - return all_docs, [] - - -if __name__ == "__main__": - pass +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import logging +import os +import random +import re +import sys +import threading + + diff --git a/ocr/__init__.py b/ocr/__init__.py index 60ab8da..4dace37 100644 --- a/ocr/__init__.py +++ b/ocr/__init__.py @@ -32,22 +32,6 @@ import sys from pathlib import Path -try: - _package = __package__ -except NameError: - _package = None - -if _package is None: - # 直接运行时,添加父目录到路径并使用绝对导入 - parent_dir = Path(__file__).parent.parent - if str(parent_dir) not in sys.path: - sys.path.insert(0, str(parent_dir)) - from ocr.ocr import OCR, TextDetector, TextRecognizer - from ocr.pdf_parser import SimplePdfParser -else: - # 作为模块导入时使用相对导入 - from .ocr import OCR, TextDetector, TextRecognizer - from .pdf_parser import SimplePdfParser __all__ = ['OCR', 'TextDetector', 'TextRecognizer', 'SimplePdfParser'] diff --git a/ocr/api.py b/ocr/api.py index 771d38f..957ecbc 100644 --- a/ocr/api.py +++ b/ocr/api.py @@ -1,332 +1,525 @@ -# -# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -""" -OCR PDF处理的FastAPI路由 -提供HTTP接口用于PDF的OCR识别 -""" - -import asyncio -import logging -import os -import sys -import tempfile -from pathlib import Path -from typing import Optional - -from fastapi import APIRouter, File, Form, HTTPException, UploadFile -from fastapi.responses import JSONResponse -from pydantic import BaseModel - -# 处理导入问题:支持直接运行和模块导入 - -try: - _package = __package__ -except NameError: - _package = None - -if _package is None: - # 直接运行时,添加父目录到路径并使用绝对导入 - parent_dir = Path(__file__).parent.parent - if str(parent_dir) not in sys.path: - sys.path.insert(0, str(parent_dir)) - from ocr.pdf_parser import SimplePdfParser - from ocr.config import MODEL_DIR -else: - # 作为模块导入时使用相对导入 - from pdf_parser import SimplePdfParser - from config import MODEL_DIR - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"]) - -# 全局解析器实例(懒加载) -_parser_instance: Optional[SimplePdfParser] = None - - -def get_parser() -> SimplePdfParser: - """获取全局解析器实例(单例模式)""" - global _parser_instance - if _parser_instance is None: - logger.info(f"Initializing OCR parser with model_dir={MODEL_DIR}") - _parser_instance = SimplePdfParser(model_dir=MODEL_DIR) - return _parser_instance - - -class ParseResponse(BaseModel): - """解析响应模型""" - success: bool - message: str - data: Optional[dict] = None - - -@router.get( - "/health", - summary="健康检查", - description="检查OCR服务的健康状态和配置信息", - response_description="返回服务状态和模型目录信息" -) -async def health_check(): - """ - 健康检查端点 - - 用于检查OCR服务的运行状态和配置信息。 - - Returns: - dict: 包含服务状态和模型目录的信息 - """ - return { - "status": "healthy", - "service": "OCR PDF Parser", - "model_dir": MODEL_DIR - } - - -@router.post( - "/parse", - response_model=ParseResponse, - summary="上传并解析PDF文件", - description="上传PDF文件并通过OCR识别提取文本内容", - response_description="返回OCR识别结果" -) -async def parse_pdf_endpoint( - file: UploadFile = File(..., description="PDF文件,支持上传任意PDF文档"), - page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"), - page_to: int 
= Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"), - zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)") -): - """ - 上传并解析PDF文件 - - 通过上传PDF文件,使用OCR技术识别并提取其中的文本内容。 - 支持指定解析的页码范围,以及调整图像放大倍数以平衡识别精度和速度。 - - Args: - file: 上传的PDF文件(multipart/form-data格式) - page_from: 起始页码(从1开始,最小值为1) - page_to: 结束页码(0表示解析到最后一页,最小值为0) - zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢) - - Returns: - ParseResponse: 包含解析结果的响应对象,包括: - - success: 是否成功 - - message: 操作结果消息 - - data: OCR识别的文本内容和元数据 - - Raises: - HTTPException: 400 - 如果文件不是PDF格式或文件为空 - HTTPException: 500 - 如果解析过程中发生错误 - """ - if not file.filename.lower().endswith('.pdf'): - raise HTTPException(status_code=400, detail="只支持PDF文件") - - # 保存上传的文件到临时目录 - temp_file = None - try: - # 读取文件内容 - content = await file.read() - if not content: - raise HTTPException(status_code=400, detail="文件为空") - - # 创建临时文件 - with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: - tmp.write(content) - temp_file = tmp.name - - logger.info(f"Parsing PDF file: {file.filename}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}") - - # 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行) - parser = get_parser() - result = await asyncio.to_thread( - parser.parse_pdf, - temp_file, - zoomin, - page_from - 1, # 转换为从0开始的索引 - (page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引 - None # callback - ) - - return ParseResponse( - success=True, - message=f"成功解析PDF: {file.filename}", - data=result - ) - - except Exception as e: - logger.error(f"Error parsing PDF: {str(e)}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"解析PDF时发生错误: {str(e)}" - ) - - finally: - # 清理临时文件 - if temp_file and os.path.exists(temp_file): - try: - os.unlink(temp_file) - except Exception as e: - logger.warning(f"Failed to delete temp file {temp_file}: {e}") - - -@router.post( - "/parse/bytes", - response_model=ParseResponse, - summary="通过二进制数据解析PDF", - description="直接通过二进制数据解析PDF文件,无需上传文件", - response_description="返回OCR识别结果" -) -async def 
parse_pdf_bytes( - pdf_bytes: bytes = File(..., description="PDF文件的二进制数据(multipart/form-data格式)"), - filename: str = Form("document.pdf", description="文件名(仅用于日志记录,不影响解析)"), - page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"), - page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"), - zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)") -): - """ - 直接通过二进制数据解析PDF - - 适用于已获取PDF二进制数据的场景,无需文件上传步骤。 - 直接将PDF的二进制数据提交即可进行OCR识别。 - - Args: - pdf_bytes: PDF文件的二进制数据(以文件形式提交) - filename: 文件名(仅用于日志记录,不影响实际解析过程) - page_from: 起始页码(从1开始,最小值为1) - page_to: 结束页码(0表示解析到最后一页,最小值为0) - zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢) - - Returns: - ParseResponse: 包含解析结果的响应对象 - - Raises: - HTTPException: 400 - 如果PDF数据为空 - HTTPException: 500 - 如果解析过程中发生错误 - """ - if not pdf_bytes: - raise HTTPException(status_code=400, detail="PDF数据为空") - - # 保存到临时文件 - temp_file = None - try: - with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: - tmp.write(pdf_bytes) - temp_file = tmp.name - - logger.info(f"Parsing PDF bytes (filename: {filename}), pages {page_from}-{page_to or 'end'}, zoomin={zoomin}") - - # 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行) - parser = get_parser() - result = await asyncio.to_thread( - parser.parse_pdf, - temp_file, - zoomin, - page_from - 1, # 转换为从0开始的索引 - (page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引 - None # callback - ) - - return ParseResponse( - success=True, - message=f"成功解析PDF: {filename}", - data=result - ) - - except Exception as e: - logger.error(f"Error parsing PDF bytes: {str(e)}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"解析PDF时发生错误: {str(e)}" - ) - - finally: - # 清理临时文件 - if temp_file and os.path.exists(temp_file): - try: - os.unlink(temp_file) - except Exception as e: - logger.warning(f"Failed to delete temp file {temp_file}: {e}") - - -@router.post( - "/parse/path", - response_model=ParseResponse, - summary="通过文件路径解析PDF", - description="通过服务器本地文件路径解析PDF文件", - 
response_description="返回OCR识别结果" -) -async def parse_pdf_path( - file_path: str = Form(..., description="PDF文件在服务器上的本地路径(必须是可访问的绝对路径)"), - page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"), - page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"), - zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)") -): - """ - 通过文件路径解析PDF - - 适用于PDF文件已经存在于服务器上的场景。 - 通过提供文件路径直接进行OCR识别,无需上传文件。 - - Args: - file_path: PDF文件在服务器上的本地路径(必须是服务器可访问的绝对路径) - page_from: 起始页码(从1开始,最小值为1) - page_to: 结束页码(0表示解析到最后一页,最小值为0) - zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢) - - Returns: - ParseResponse: 包含解析结果的响应对象 - - Raises: - HTTPException: 400 - 如果文件不是PDF格式 - HTTPException: 404 - 如果文件不存在 - HTTPException: 500 - 如果解析过程中发生错误 - - Note: - 此端点需要确保提供的文件路径在服务器上可访问。 - 建议仅在内网环境或受信任的环境中使用,避免路径遍历安全风险。 - """ - if not os.path.exists(file_path): - raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}") - - if not file_path.lower().endswith('.pdf'): - raise HTTPException(status_code=400, detail="只支持PDF文件") - - try: - logger.info(f"Parsing PDF from path: {file_path}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}") - - # 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行) - parser = get_parser() - result = await asyncio.to_thread( - parser.parse_pdf, - file_path, - zoomin, - page_from - 1, # 转换为从0开始的索引 - (page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引 - None # callback - ) - - return ParseResponse( - success=True, - message=f"成功解析PDF: {file_path}", - data=result - ) - - except Exception as e: - logger.error(f"Error parsing PDF from path: {str(e)}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"解析PDF时发生错误: {str(e)}" - ) - +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +FastAPI routes for OCR-based PDF processing. +Exposes HTTP endpoints that run OCR over PDF files. +""" + +import asyncio +import logging +import os +import sys +import tempfile +from pathlib import Path +from typing import Optional + +from fastapi import APIRouter, File, Form, HTTPException, UploadFile +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field + +from ocr.pdf_parser import SimplePdfParser +from ocr.config import MODEL_DIR + +logger = logging.getLogger(__name__) + +router = ocr_router = APIRouter(prefix="/api/v1/ocr", tags=["OCR"])  # `router` is used by the decorators below; `ocr_router` is the name ocr.main imports + +# Module-level parser instance, created lazily by get_parser() +_parser_instance: Optional[SimplePdfParser] = None + + +def get_parser() -> SimplePdfParser: + """Return the shared SimplePdfParser, creating it on first use.""" + global _parser_instance + if _parser_instance is None: + logger.info(f"Initializing OCR parser with model_dir={MODEL_DIR}") + _parser_instance = SimplePdfParser(model_dir=MODEL_DIR) + return _parser_instance + + +class ParseResponse(BaseModel): + """解析响应模型""" + success: bool + message: str + data: Optional[dict] = None + + +@router.get( + "/health", + summary="健康检查", + description="检查OCR服务的健康状态和配置信息", + response_description="返回服务状态和模型目录信息" +) +async def health_check(): + """ + Health-check endpoint. + + Reports that the service is up together with the configured model directory. + + Returns: + dict: the status, service name and model directory + """ + return { + "status": "healthy", + "service": "OCR PDF Parser", + "model_dir": MODEL_DIR + } + + +@router.post( + "/parse", + response_model=ParseResponse, + summary="上传并解析PDF文件", + description="上传PDF文件并通过OCR识别提取文本内容", + response_description="返回OCR识别结果" +) +async def parse_pdf_endpoint( + file: UploadFile = File(..., description="PDF文件,支持上传任意PDF文档"), + page_from: int = 
Form(1, ge=1, description="起始页码(从1开始,默认为1)"), + page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"), + zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)") +): + """ + 上传并解析PDF文件 + + 通过上传PDF文件,使用OCR技术识别并提取其中的文本内容。 + 支持指定解析的页码范围,以及调整图像放大倍数以平衡识别精度和速度。 + + Args: + file: 上传的PDF文件(multipart/form-data格式) + page_from: 起始页码(从1开始,最小值为1) + page_to: 结束页码(0表示解析到最后一页,最小值为0) + zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢) + + Returns: + ParseResponse: 包含解析结果的响应对象,包括: + - success: 是否成功 + - message: 操作结果消息 + - data: OCR识别的文本内容和元数据 + + Raises: + HTTPException: 400 - 如果文件不是PDF格式或文件为空 + HTTPException: 500 - 如果解析过程中发生错误 + """ + if not file.filename.lower().endswith('.pdf'): + raise HTTPException(status_code=400, detail="只支持PDF文件") + + # 保存上传的文件到临时目录 + temp_file = None + try: + # 读取文件内容 + content = await file.read() + if not content: + raise HTTPException(status_code=400, detail="文件为空") + + # 创建临时文件 + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: + tmp.write(content) + temp_file = tmp.name + + logger.info(f"Parsing PDF file: {file.filename}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}") + + # 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行) + parser = get_parser() + result = await asyncio.to_thread( + parser.parse_pdf, + temp_file, + zoomin, + page_from - 1, # 转换为从0开始的索引 + (page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引 + None # callback + ) + + return ParseResponse( + success=True, + message=f"成功解析PDF: {file.filename}", + data=result + ) + + except Exception as e: + logger.error(f"Error parsing PDF: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"解析PDF时发生错误: {str(e)}" + ) + + finally: + # 清理临时文件 + if temp_file and os.path.exists(temp_file): + try: + os.unlink(temp_file) + except Exception as e: + logger.warning(f"Failed to delete temp file {temp_file}: {e}") + + +@router.post( + "/parse/bytes", + response_model=ParseResponse, + summary="通过二进制数据解析PDF", + 
description="直接通过二进制数据解析PDF文件,无需上传文件", + response_description="返回OCR识别结果" +) +async def parse_pdf_bytes( + pdf_bytes: bytes = File(..., description="PDF文件的二进制数据(multipart/form-data格式)"), + filename: str = Form("document.pdf", description="文件名(仅用于日志记录,不影响解析)"), + page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"), + page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"), + zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)") +): + """ + 直接通过二进制数据解析PDF + + 适用于已获取PDF二进制数据的场景,无需文件上传步骤。 + 直接将PDF的二进制数据提交即可进行OCR识别。 + + Args: + pdf_bytes: PDF文件的二进制数据(以文件形式提交) + filename: 文件名(仅用于日志记录,不影响实际解析过程) + page_from: 起始页码(从1开始,最小值为1) + page_to: 结束页码(0表示解析到最后一页,最小值为0) + zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢) + + Returns: + ParseResponse: 包含解析结果的响应对象 + + Raises: + HTTPException: 400 - 如果PDF数据为空 + HTTPException: 500 - 如果解析过程中发生错误 + """ + if not pdf_bytes: + raise HTTPException(status_code=400, detail="PDF数据为空") + + # 保存到临时文件 + temp_file = None + try: + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: + tmp.write(pdf_bytes) + temp_file = tmp.name + + logger.info(f"Parsing PDF bytes (filename: {filename}), pages {page_from}-{page_to or 'end'}, zoomin={zoomin}") + + # 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行) + parser = get_parser() + result = await asyncio.to_thread( + parser.parse_pdf, + temp_file, + zoomin, + page_from - 1, # 转换为从0开始的索引 + (page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引 + None # callback + ) + + return ParseResponse( + success=True, + message=f"成功解析PDF: {filename}", + data=result + ) + + except Exception as e: + logger.error(f"Error parsing PDF bytes: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"解析PDF时发生错误: {str(e)}" + ) + + finally: + # 清理临时文件 + if temp_file and os.path.exists(temp_file): + try: + os.unlink(temp_file) + except Exception as e: + logger.warning(f"Failed to delete temp file {temp_file}: {e}") + + +@router.post( + "/parse/path", + 
response_model=ParseResponse, + summary="通过文件路径解析PDF", + description="通过服务器本地文件路径解析PDF文件", + response_description="返回OCR识别结果" +) +async def parse_pdf_path( + file_path: str = Form(..., description="PDF文件在服务器上的本地路径(必须是可访问的绝对路径)"), + page_from: int = Form(1, ge=1, description="起始页码(从1开始,默认为1)"), + page_to: int = Form(0, ge=0, description="结束页码(0表示解析到最后一页,默认为0)"), + zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,数值越大识别精度越高但速度越慢,默认为3)") +): + """ + 通过文件路径解析PDF + + 适用于PDF文件已经存在于服务器上的场景。 + 通过提供文件路径直接进行OCR识别,无需上传文件。 + + Args: + file_path: PDF文件在服务器上的本地路径(必须是服务器可访问的绝对路径) + page_from: 起始页码(从1开始,最小值为1) + page_to: 结束页码(0表示解析到最后一页,最小值为0) + zoomin: 图像放大倍数(1-5之间,数值越大识别精度越高但处理速度越慢) + + Returns: + ParseResponse: 包含解析结果的响应对象 + + Raises: + HTTPException: 400 - 如果文件不是PDF格式 + HTTPException: 404 - 如果文件不存在 + HTTPException: 500 - 如果解析过程中发生错误 + + Note: + 此端点需要确保提供的文件路径在服务器上可访问。 + 建议仅在内网环境或受信任的环境中使用,避免路径遍历安全风险。 + """ + if not os.path.exists(file_path): + raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}") + + if not file_path.lower().endswith('.pdf'): + raise HTTPException(status_code=400, detail="只支持PDF文件") + + try: + logger.info(f"Parsing PDF from path: {file_path}, pages {page_from}-{page_to or 'end'}, zoomin={zoomin}") + + # 解析PDF(parse_pdf是同步方法,使用to_thread在线程池中执行) + parser = get_parser() + result = await asyncio.to_thread( + parser.parse_pdf, + file_path, + zoomin, + page_from - 1, # 转换为从0开始的索引 + (page_to - 1) if page_to > 0 else 299, # 转换为从0开始的索引 + None # callback + ) + + return ParseResponse( + success=True, + message=f"成功解析PDF: {file_path}", + data=result + ) + + except Exception as e: + logger.error(f"Error parsing PDF from path: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"解析PDF时发生错误: {str(e)}" + ) + + +@router.post( + "/parse_into_bboxes", + summary="解析PDF并返回边界框", + description="解析PDF文件并返回文本边界框信息,用于文档结构化处理", + response_description="返回包含文本边界框的列表" +) +async def parse_into_bboxes_endpoint( + pdf_bytes: bytes = File(..., 
description="PDF文件的二进制数据"), + filename: str = Form("document.pdf", description="文件名(仅用于日志)"), + zoomin: int = Form(3, ge=1, le=5, description="图像放大倍数(1-5,默认为3)") +): + """ + 解析PDF并返回边界框 + + 此接口用于将PDF文档解析为结构化文本边界框,每个边界框包含: + - 文本内容 + - 页面编号 + - 坐标信息(x0, x1, top, bottom) + - 布局类型(如 text, table, figure 等) + - 图像数据(如果有) + + Args: + pdf_bytes: PDF文件的二进制数据 + filename: 文件名(仅用于日志记录) + zoomin: 图像放大倍数(1-5之间) + + Returns: + dict: 包含解析结果的对象,data字段为边界框列表 + + Raises: + HTTPException: 400 - 如果PDF数据为空 + HTTPException: 500 - 如果解析过程中发生错误 + """ + if not pdf_bytes: + raise HTTPException(status_code=400, detail="PDF数据为空") + + temp_file = None + try: + # 保存到临时文件 + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: + tmp.write(pdf_bytes) + temp_file = tmp.name + + logger.info(f"Parsing PDF into bboxes: {filename}, zoomin={zoomin}") + + # 定义一个简单的callback包装器,用于处理进度回调(记录日志) + def progress_callback(prog, msg): + logger.info(f"Progress: {prog:.2%} - {msg}") + + parser = get_parser() + result = await asyncio.to_thread( + parser.parse_into_bboxes, + temp_file, + progress_callback, + zoomin + ) + + # 将图像数据转换为base64或None + processed_result = [] + for bbox in result: + processed_bbox = dict(bbox) + # 如果有图像,转换为base64(如果需要的话,可以在这里处理) + # 但为了保持兼容性,我们保留原始格式 + processed_result.append(processed_bbox) + + return ParseResponse( + success=True, + message=f"成功解析PDF为边界框: {filename}", + data={"bboxes": processed_result} + ) + + except Exception as e: + logger.error(f"Error parsing PDF into bboxes: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"解析PDF为边界框时发生错误: {str(e)}" + ) + + finally: + # 清理临时文件 + if temp_file and os.path.exists(temp_file): + try: + os.unlink(temp_file) + except Exception as e: + logger.warning(f"Failed to delete temp file {temp_file}: {e}") + + +class TextRequest(BaseModel): + """文本处理请求模型""" + text: str = Field(..., description="需要处理的文本内容") + + +class RemoveTagResponse(BaseModel): + """移除标签响应模型""" + success: bool + message: str + text: 
@router.post(
    "/remove_tag",
    response_model=RemoveTagResponse,
    summary="移除文本中的位置标签",
    description="从文本中移除PDF解析生成的位置标签(格式:@@页码\t坐标##)",
    response_description="返回移除标签后的文本"
)
async def remove_tag_endpoint(request: TextRequest):
    """
    Strip PDF position tags from a piece of text.

    The cleaning itself is delegated to ``SimplePdfParser.remove_tag``.

    Args:
        request: Request body carrying the text to clean.

    Returns:
        RemoveTagResponse: ``text`` holds the cleaned string.

    Raises:
        HTTPException: 400 when the text is empty.
        HTTPException: 500 when tag removal fails.
    """
    raw_text = request.text
    if not raw_text:
        raise HTTPException(status_code=400, detail="文本内容不能为空")

    try:
        return RemoveTagResponse(
            success=True,
            message="成功移除文本标签",
            text=SimplePdfParser.remove_tag(raw_text),
        )
    except Exception as exc:
        logger.error(f"Error removing tag: {exc!s}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"移除标签时发生错误: {exc!s}",
        )
+ ] + + Args: + request: 包含待处理文本的请求对象 + + Returns: + ExtractPositionsResponse: 包含提取结果的响应对象 + + Raises: + HTTPException: 400 - 如果文本为空 + """ + if not request.text: + raise HTTPException(status_code=400, detail="文本内容不能为空") + + try: + positions = SimplePdfParser.extract_positions(request.text) + + # 将位置信息转换为可序列化的格式 + serializable_positions = [ + { + "page_numbers": pos[0], + "left": pos[1], + "right": pos[2], + "top": pos[3], + "bottom": pos[4] + } + for pos in positions + ] + + return ExtractPositionsResponse( + success=True, + message=f"成功提取 {len(positions)} 个位置信息", + positions=serializable_positions + ) + + except Exception as e: + logger.error(f"Error extracting positions: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"提取位置信息时发生错误: {str(e)}" + ) \ No newline at end of file diff --git a/ocr/main.py b/ocr/main.py index 52f18e7..b03dacf 100644 --- a/ocr/main.py +++ b/ocr/main.py @@ -29,25 +29,8 @@ import uvicorn from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -# 处理直接运行时的导入问题 -# 当直接运行 python ocr/main.py 时,__package__ 为 None -# 当作为模块运行时(python -m ocr.main),__package__ 为 'ocr' -try: - _package = __package__ -except NameError: - _package = None - -if _package is None: - # 直接运行脚本时,添加父目录到路径 - parent_dir = Path(__file__).parent.parent - if str(parent_dir) not in sys.path: - sys.path.insert(0, str(parent_dir)) - from api import router as ocr_router - from config import MODEL_DIR -else: - # 作为模块导入时使用相对导入 - from api import router as ocr_router - from config import MODEL_DIR +from ocr.api import ocr_router +from ocr.config import MODEL_DIR # 配置日志 logging.basicConfig( diff --git a/ocr/pdf_parser.py b/ocr/pdf_parser.py index 800c6e4..9a954a1 100644 --- a/ocr/pdf_parser.py +++ b/ocr/pdf_parser.py @@ -1,339 +1,1319 @@ -# -# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -简化的PDF解析器,只使用OCR处理PDF文档 - -从 RAGFlow 的 RAGFlowPdfParser 中提取OCR相关功能,移除了: -- 布局识别(Layout Recognition) -- 表格结构识别(Table Structure Recognition) -- 文本合并和语义分析 -- RAG相关功能 - -只保留: -- PDF转图片 -- OCR文本检测和识别 -- 基本的文本和位置信息返回 -""" - -import logging -import sys -import threading -from io import BytesIO -from pathlib import Path -from timeit import default_timer as timer - -import numpy as np -import pdfplumber -import trio - -# 处理导入问题:支持直接运行和模块导入 -try: - _package = __package__ -except NameError: - _package = None - -if _package is None: - # 直接运行时,添加父目录到路径并使用绝对导入 - parent_dir = Path(__file__).parent.parent - if str(parent_dir) not in sys.path: - sys.path.insert(0, str(parent_dir)) - from ocr.config import PARALLEL_DEVICES - from ocr.ocr import OCR -else: - # 作为模块导入时使用相对导入 - from config import PARALLEL_DEVICES - from ocr import OCR - -LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" -if LOCK_KEY_pdfplumber not in sys.modules: - sys.modules[LOCK_KEY_pdfplumber] = threading.Lock() - - -class SimplePdfParser: - """ - 简化的PDF解析器,只使用OCR处理PDF - - 使用方法: - parser = SimplePdfParser() - result = parser.parse_pdf("file.pdf") # 或传入二进制数据 - # result 格式: - # { - # "pages": [ - # { - # "page_number": 1, - # "boxes": [ - # { - # "text": "识别到的文本", - # "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], - # "confidence": 0.95 - # }, - # ... - # ] - # }, - # ... 
- # ] - # } - """ - - def __init__(self, model_dir=None): - """ - 初始化PDF解析器 - - Args: - model_dir: OCR模型目录,如果为None则使用默认路径 - """ - self.ocr = OCR(model_dir=model_dir) - self.parallel_limiter = None - if PARALLEL_DEVICES > 1: - self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)] - - def __ocr_page(self, page_num, img, zoomin=3, device_id=None): - """ - 对单页进行OCR处理 - - Args: - page_num: 页码 - img: PIL Image对象 - zoomin: 放大倍数(用于坐标缩放) - device_id: GPU设备ID - - Returns: - list: OCR结果列表,每个元素为 {"text": str, "bbox": list, "confidence": float} - """ - start = timer() - img_np = np.array(img) - - # 文本检测 - # detect方法返回: zip对象,格式为 (box_coords, (text, score)) - # 但检测阶段text和score都是默认值,需要后续识别 - detection_result = self.ocr.detect(img_np, device_id) - - if detection_result is None: - return [] - - # 转换为列表并提取box坐标 - # detect返回的格式是zip,每个元素是 (box_coords, (text, score)) - # 在检测阶段,text是空字符串,score是0 - bxs = list(detection_result) - - logging.info(f"Page {page_num}: OCR detection found {len(bxs)} boxes in {timer() - start:.2f}s") - - if not bxs: - return [] - - # 解析检测结果并准备识别 - boxes_to_reg = [] - - start = timer() - for box_coords, _, _ in bxs: - # box_coords 是四边形坐标: [[x0, y0], [x1, y0], [x1, y1], [x0, y1]] - # 转换为原始坐标(考虑zoomin) - box_coords_np = np.array(box_coords, dtype=np.float32) - original_coords = box_coords_np / zoomin # 缩放回原始坐标 - - # 裁剪图像用于识别 - # 使用放大后的坐标裁剪(因为img_np是放大后的图像) - crop_box = box_coords_np - crop_img = self.ocr.get_rotate_crop_image(img_np, crop_box) - boxes_to_reg.append({ - "bbox": original_coords.tolist(), - "crop_img": crop_img - }) - - # 批量识别文本 - ocr_results = [] - if boxes_to_reg: - crop_imgs = [b["crop_img"] for b in boxes_to_reg] - texts = self.ocr.recognize_batch(crop_imgs, device_id) - - # 组装结果 - for i, b in enumerate(boxes_to_reg): - if i < len(texts) and texts[i]: # 过滤空文本 - ocr_results.append({ - "text": texts[i], - "bbox": b["bbox"], - "confidence": 0.9 # 简化版本,不计算具体置信度 - }) - - logging.info(f"Page {page_num}: OCR recognition 
{len(ocr_results)} boxes cost {timer() - start:.2f}s") - return ocr_results - - async def __ocr_page_async(self, page_num, img, zoomin, device_id, limiter, callback): - """ - 异步OCR处理单页 - - Args: - page_num: 页码 - img: PIL Image对象 - zoomin: 放大倍数 - device_id: GPU设备ID - limiter: 并发限制器 - callback: 进度回调函数 - """ - if limiter: - async with limiter: - result = await trio.to_thread.run_sync( - lambda: self.__ocr_page(page_num, img, zoomin, device_id) - ) - else: - result = await trio.to_thread.run_sync( - lambda: self.__ocr_page(page_num, img, zoomin, device_id) - ) - - if callback and page_num % 5 == 0: - callback(prog=page_num * 0.9 / 100, msg=f"Processing page {page_num}...") - - return result - - def __convert_pdf_to_images(self, pdf_source, zoomin=3, page_from=0, page_to=299): - """ - 将PDF转换为图片 - - Args: - pdf_source: PDF文件路径(str)或二进制数据(bytes) - zoomin: 放大倍数,默认3(72*3=216 DPI) - page_from: 起始页码(从0开始) - page_to: 结束页码 - - Returns: - list: PIL Image对象列表 - """ - start = timer() - page_images = [] - - try: - with sys.modules[LOCK_KEY_pdfplumber]: - pdf = pdfplumber.open(pdf_source) if isinstance(pdf_source, str) else pdfplumber.open(BytesIO(pdf_source)) - try: - # 转换为图片,resolution = 72 * zoomin - page_images = [ - p.to_image(resolution=72 * zoomin, antialias=True).annotated - for i, p in enumerate(pdf.pages[page_from:page_to]) - ] - pdf.close() - except Exception as e: - logging.warning(f"Failed to convert PDF pages {page_from}-{page_to}: {str(e)}") - if hasattr(pdf, 'close'): - pdf.close() - except Exception as e: - logging.exception(f"Error converting PDF to images: {str(e)}") - - logging.info(f"Converted {len(page_images)} pages to images in {timer() - start:.2f}s") - return page_images - - def parse_pdf(self, pdf_source, zoomin=3, page_from=0, page_to=299, callback=None): - """ - 解析PDF文档,使用OCR识别文本 - - Args: - pdf_source: PDF文件路径(str)或二进制数据(bytes) - zoomin: 放大倍数,默认3 - page_from: 起始页码(从0开始) - page_to: 结束页码 - callback: 进度回调函数,格式: callback(prog: float, msg: str) - - Returns: 
- dict: 解析结果 - { - "pages": [ - { - "page_number": int, - "boxes": [ - { - "text": str, - "bbox": [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], - "confidence": float - }, - ... - ] - }, - ... - ] - } - """ - if callback: - callback(0.0, "Starting PDF parsing...") - - # 1. 转换为图片 - if callback: - callback(0.1, "Converting PDF to images...") - page_images = self.__convert_pdf_to_images(pdf_source, zoomin, page_from, page_to) - - if not page_images: - logging.warning("No pages converted from PDF") - return {"pages": []} - - # 2. OCR处理 - async def process_all_pages(): - pages_result = [] - - if self.parallel_limiter: - # 并行处理(多GPU) - async with trio.open_nursery() as nursery: - tasks = [] - for i, img in enumerate(page_images): - page_num = page_from + i + 1 - device_id = i % PARALLEL_DEVICES - task = nursery.start_soon( - self.__ocr_page_async, - page_num, img, zoomin, device_id, - self.parallel_limiter[device_id], callback - ) - tasks.append(task) - - # 等待所有任务完成并收集结果 - for i, task in enumerate(tasks): - result = await task - pages_result.append({ - "page_number": page_from + i + 1, - "boxes": result - }) - else: - # 串行处理(单GPU或CPU) - for i, img in enumerate(page_images): - page_num = page_from + i + 1 - result = await trio.to_thread.run_sync( - lambda img=img, pn=page_num: self.__ocr_page(pn, img, zoomin, 0) - ) - pages_result.append({ - "page_number": page_num, - "boxes": result - }) - if callback: - callback(0.1 + (i + 1) * 0.9 / len(page_images), f"Processing page {page_num}...") - - return pages_result - - # 运行异步处理 - if callback: - callback(0.2, "Starting OCR processing...") - - start = timer() - pages_result = trio.run(process_all_pages) - logging.info(f"OCR processing completed in {timer() - start:.2f}s") - - if callback: - callback(1.0, "OCR processing completed") - - return { - "pages": pages_result - } - - -# 向后兼容的别名 -PdfParser = SimplePdfParser - +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +简化的PDF解析器,只使用OCR处理PDF文档 + +从 RAGFlow 的 RAGFlowPdfParser 中提取OCR相关功能,移除了: +- 布局识别(Layout Recognition) +- 表格结构识别(Table Structure Recognition) +- 文本合并和语义分析 +- RAG相关功能 + +只保留: +- PDF转图片 +- OCR文本检测和识别 +- 基本的文本和位置信息返回 +""" +from pathlib import Path +from tkinter import Image +from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk +from pypdf import PdfReader as pdf2_read +from huggingface_hub import snapshot_download +import logging +import os +import random +import re +import sys +import threading +from copy import deepcopy +from io import BytesIO +from timeit import default_timer as timer + +import numpy as np +import pdfplumber +import trio +import xgboost as xgb +from agent import settings +from deepdoc.vision import AscendLayoutRecognizer, TableStructureRecognizer, Recognizer +from deepdoc.vision.layout_recognizer import LayoutRecognizer +from ocr.utils import get_project_base_directory +from rag.nlp import rag_tokenizer +from rag.prompts.generator import vision_llm_describe_prompt + +# 处理导入问题:支持直接运行和模块导入 +try: + _package = __package__ +except NameError: + _package = None + +if _package is None: + # 直接运行时,添加父目录到路径并使用绝对导入 + parent_dir = Path(__file__).parent.parent + if str(parent_dir) not in sys.path: + sys.path.insert(0, str(parent_dir)) + from ocr.config import PARALLEL_DEVICES + from ocr.ocr import OCR +else: + # 作为模块导入时使用相对导入 + from config import PARALLEL_DEVICES + from ocr import OCR + 
def __init__(self, **kwargs):
    """
    Build the OCR engine, layout/table recognizers and the XGBoost
    up/down concatenation model.

    Environment:
        LAYOUT_RECOGNIZER_TYPE: ``"onnx"`` (default) or ``"ascend"``.
        HF_ENDPOINT: if downloading HuggingFace models is troublesome,
            pointing this at a mirror (e.g. https://hf-mirror.com) may help.

    Args:
        **kwargs: Accepted for call compatibility; not read by this method.

    Raises:
        RuntimeError: If LAYOUT_RECOGNIZER_TYPE is neither "onnx" nor "ascend".
    """

    self.ocr = OCR()
    # One single-slot limiter per device; None when there is at most one device.
    self.parallel_limiter = None
    if PARALLEL_DEVICES > 1:
        self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)]

    layout_recognizer_type = os.getenv("LAYOUT_RECOGNIZER_TYPE", "onnx").lower()
    if layout_recognizer_type not in ["onnx", "ascend"]:
        raise RuntimeError("Unsupported layout recognizer type.")

    # An optional `model_speciess` attribute (set elsewhere, if at all)
    # selects a specialised layout model domain.
    if hasattr(self, "model_speciess"):
        recognizer_domain = "layout." + self.model_speciess
    else:
        recognizer_domain = "layout"

    if layout_recognizer_type == "ascend":
        logging.debug("Using Ascend LayoutRecognizer")
        self.layouter = AscendLayoutRecognizer(recognizer_domain)
    else:  # onnx
        logging.debug("Using Onnx LayoutRecognizer")
        self.layouter = LayoutRecognizer(recognizer_domain)
    self.tbl_det = TableStructureRecognizer()

    self.updown_cnt_mdl = xgb.Booster()
    if not settings.LIGHTEN:
        try:
            import torch.cuda

            if torch.cuda.is_available():
                self.updown_cnt_mdl.set_param({"device": "cuda"})
        except Exception:
            # Best effort: a missing or broken torch must not prevent construction.
            # The label names this class so the log points at the right parser.
            logging.exception("SimplePdfParser __init__")
    try:
        model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
        self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
    except Exception:
        # Local model not loadable: download it into the same directory and retry.
        model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0", local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), local_dir_use_symlinks=False)
        self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))

    self.page_from = 0
    self.column_num = 1
c["x0"]) // max(len(c["text"]), 1) + + def __height(self, c): + return c["bottom"] - c["top"] + + def _x_dis(self, a, b): + return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]), abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2) + + def _y_dis(self, a, b): + return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2 + + def _match_proj(self, b): + proj_patt = [ + r"第[零一二三四五六七八九十百]+章", + r"第[零一二三四五六七八九十百]+[条节]", + r"[零一二三四五六七八九十百]+[、是  ]", + r"[\((][零一二三四五六七八九十百]+[)\)]", + r"[\((][0-9]+[)\)]", + r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})", + r"[0-9]+\.[0-9.]+(、|\.[  ])", + r"[⚫•➢①② ]", + ] + return any([re.match(p, b["text"]) for p in proj_patt]) + + def _updown_concat_features(self, up, down): + w = max(self.__char_width(up), self.__char_width(down)) + h = max(self.__height(up), self.__height(down)) + y_dis = self._y_dis(up, down) + LEN = 6 + tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split() + tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split() + tks_all = up["text"][-LEN:].strip() + (" " if re.match(r"[a-zA-Z0-9]+", up["text"][-1] + down["text"][0]) else "") + down["text"][:LEN].strip() + tks_all = rag_tokenizer.tokenize(tks_all).split() + fea = [ + up.get("R", -1) == down.get("R", -1), + y_dis / h, + down["page_number"] - up["page_number"], + up["layout_type"] == down["layout_type"], + up["layout_type"] == "text", + down["layout_type"] == "text", + up["layout_type"] == "table", + down["layout_type"] == "table", + True if re.search(r"([。?!;!?;+))]|[a-z]\.)$", up["text"]) else False, + True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False, + True if re.search(r"(^.?[/,?;:\],。;:’”?!》】)-])", down["text"]) else False, + True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False, + True if re.search(r"[,,][^。.]+$", up["text"]) else False, + True if re.search(r"[,,][^。.]+$", up["text"]) else False, + True if re.search(r"[\((][^\))]+$", up["text"]) and re.search(r"[\))]", down["text"]) else False, + self._match_proj(down), + True 
if re.match(r"[A-Z]", down["text"]) else False, + True if re.match(r"[A-Z]", up["text"][-1]) else False, + True if re.match(r"[a-z0-9]", up["text"][-1]) else False, + True if re.match(r"[0-9.%,-]+$", down["text"]) else False, + up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False, + up["x0"] > down["x1"], + abs(self.__height(up) - self.__height(down)) / min(self.__height(up), self.__height(down)), + self._x_dis(up, down) / max(w, 0.000001), + (len(up["text"]) - len(down["text"])) / max(len(up["text"]), len(down["text"])), + len(tks_all) - len(tks_up) - len(tks_down), + len(tks_down) - len(tks_up), + tks_down[-1] == tks_up[-1] if tks_down and tks_up else False, + max(down["in_row"], up["in_row"]), + abs(down["in_row"] - up["in_row"]), + len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0, + len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0, + ] + return fea + + @staticmethod + def sort_X_by_page(arr, threshold): + # sort using y1 first and then x1 + arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) + for i in range(len(arr) - 1): + for j in range(i, -1, -1): + # restore the order using th + if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold and arr[j + 1]["top"] < arr[j]["top"] and arr[j + 1]["page_number"] == arr[j]["page_number"]: + tmp = arr[j] + arr[j] = arr[j + 1] + arr[j + 1] = tmp + return arr + + def _has_color(self, o): + if o.get("ncs", "") == "DeviceGray": + if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and o["non_stroking_color"][0] == 1: + if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")): + return False + return True + + def _table_transformer_job(self, ZM): + logging.debug("Table processing...") + imgs, pos = [], [] + tbcnt = [0] + MARGIN = 10 + self.tb_cpns = [] + assert len(self.page_layout) == len(self.page_images) + for p, tbls in enumerate(self.page_layout): # for page + tbls = 
[f for f in tbls if f["type"] == "table"] + tbcnt.append(len(tbls)) + if not tbls: + continue + for tb in tbls: # for table + left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, tb["x1"] + MARGIN, tb["bottom"] + MARGIN + left *= ZM + top *= ZM + right *= ZM + bott *= ZM + pos.append((left, top)) + imgs.append(self.page_images[p].crop((left, top, right, bott))) + + assert len(self.page_images) == len(tbcnt) - 1 + if not imgs: + return + recos = self.tbl_det(imgs) + tbcnt = np.cumsum(tbcnt) + for i in range(len(tbcnt) - 1): # for page + pg = [] + for j, tb_items in enumerate(recos[tbcnt[i] : tbcnt[i + 1]]): # for table + poss = pos[tbcnt[i] : tbcnt[i + 1]] + for it in tb_items: # for table components + it["x0"] = it["x0"] + poss[j][0] + it["x1"] = it["x1"] + poss[j][0] + it["top"] = it["top"] + poss[j][1] + it["bottom"] = it["bottom"] + poss[j][1] + for n in ["x0", "x1", "top", "bottom"]: + it[n] /= ZM + it["top"] += self.page_cum_height[i] + it["bottom"] += self.page_cum_height[i] + it["pn"] = i + it["layoutno"] = j + pg.append(it) + self.tb_cpns.extend(pg) + + def gather(kwd, fzy=10, ption=0.6): + eles = Recognizer.sort_Y_firstly([r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy) + eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption) + return Recognizer.sort_Y_firstly(eles, 0) + + # add R,H,C,SP tag to boxes within table layout + headers = gather(r".*header$") + rows = gather(r".* (row|header)") + spans = gather(r".*spanning") + clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"])) + clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5) + for b in self.boxes: + if b.get("layout_type", "") != "table": + continue + ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3) + if ii is not None: + b["R"] = ii + b["R_top"] = rows[ii]["top"] + b["R_bott"] = rows[ii]["bottom"] + + ii = Recognizer.find_overlapped_with_threshold(b, headers, thr=0.3) + 
if ii is not None: + b["H_top"] = headers[ii]["top"] + b["H_bott"] = headers[ii]["bottom"] + b["H_left"] = headers[ii]["x0"] + b["H_right"] = headers[ii]["x1"] + b["H"] = ii + + ii = Recognizer.find_horizontally_tightest_fit(b, clmns) + if ii is not None: + b["C"] = ii + b["C_left"] = clmns[ii]["x0"] + b["C_right"] = clmns[ii]["x1"] + + ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3) + if ii is not None: + b["H_top"] = spans[ii]["top"] + b["H_bott"] = spans[ii]["bottom"] + b["H_left"] = spans[ii]["x0"] + b["H_right"] = spans[ii]["x1"] + b["SP"] = ii + + def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None): + start = timer() + bxs = self.ocr.detect(np.array(img), device_id) + logging.info(f"__ocr detecting boxes of a image cost ({timer() - start}s)") + + start = timer() + if not bxs: + self.boxes.append([]) + return + bxs = [(line[0], line[1][0]) for line in bxs] + bxs = Recognizer.sort_Y_firstly( + [ + {"x0": b[0][0] / ZM, "x1": b[1][0] / ZM, "top": b[0][1] / ZM, "text": "", "txt": t, "bottom": b[-1][1] / ZM, "chars": [], "page_number": pagenum} + for b, t in bxs + if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1] + ], + self.mean_height[pagenum - 1] / 3, + ) + + # merge chars in the same rect + for c in chars: + ii = Recognizer.find_overlapped(c, bxs) + if ii is None: + self.lefted_chars.append(c) + continue + ch = c["bottom"] - c["top"] + bh = bxs[ii]["bottom"] - bxs[ii]["top"] + if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != " ": + self.lefted_chars.append(c) + continue + bxs[ii]["chars"].append(c) + + for b in bxs: + if not b["chars"]: + del b["chars"] + continue + m_ht = np.mean([c["height"] for c in b["chars"]]) + for c in Recognizer.sort_Y_firstly(b["chars"], m_ht): + if c["text"] == " " and b["text"]: + if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]): + b["text"] += " " + else: + b["text"] += c["text"] + del b["chars"] + + logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s") + start = 
timer() + boxes_to_reg = [] + img_np = np.array(img) + for b in bxs: + if not b["text"]: + left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM + b["box_image"] = self.ocr.get_rotate_crop_image(img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)) + boxes_to_reg.append(b) + del b["txt"] + texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg], device_id) + for i in range(len(boxes_to_reg)): + boxes_to_reg[i]["text"] = texts[i] + del boxes_to_reg[i]["box_image"] + logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s") + bxs = [b for b in bxs if b["text"]] + if self.mean_height[pagenum - 1] == 0: + self.mean_height[pagenum - 1] = np.median([b["bottom"] - b["top"] for b in bxs]) + self.boxes.append(bxs) + + def _layouts_rec(self, ZM, drop=True): + assert len(self.page_images) == len(self.boxes) + self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop) + # cumlative Y + for i in range(len(self.boxes)): + self.boxes[i]["top"] += self.page_cum_height[self.boxes[i]["page_number"] - 1] + self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1] + + def _text_merge(self): + # merge adjusted boxes + bxs = self.boxes + + def end_with(b, txt): + txt = txt.strip() + tt = b.get("text", "").strip() + return tt and tt.find(txt) == len(tt) - len(txt) + + def start_with(b, txts): + tt = b.get("text", "").strip() + return tt and any([tt.find(t.strip()) == 0 for t in txts]) + + # horizontally merge adjacent box with the same layout + i = 0 + while i < len(bxs) - 1: + b = bxs[i] + b_ = bxs[i + 1] + if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: + i += 1 + continue + if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: + # merge + bxs[i]["x1"] = b_["x1"] + bxs[i]["top"] = (b["top"] + b_["top"]) / 2 + bxs[i]["bottom"] = 
(b["bottom"] + b_["bottom"]) / 2 + bxs[i]["text"] += b_["text"] + bxs.pop(i + 1) + continue + i += 1 + continue + + dis_thr = 1 + dis = b["x1"] - b_["x0"] + if b.get("layout_type", "") != "text" or b_.get("layout_type", "") != "text": + if end_with(b, ",") or start_with(b_, "(,"): + dis_thr = -8 + else: + i += 1 + continue + + if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 and dis >= dis_thr and b["x1"] < b_["x1"]: + # merge + bxs[i]["x1"] = b_["x1"] + bxs[i]["top"] = (b["top"] + b_["top"]) / 2 + bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2 + bxs[i]["text"] += b_["text"] + bxs.pop(i + 1) + continue + i += 1 + self.boxes = bxs + + def _naive_vertical_merge(self, zoomin=3): + import math + bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3) + + column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) + if not column_width or math.isnan(column_width): + column_width = self.mean_width[0] + self.column_num = int(self.page_images[0].size[0] / zoomin / column_width) + if column_width < self.page_images[0].size[0] / zoomin / self.column_num: + logging.info("Multi-column................... 
{} {}".format(column_width, self.page_images[0].size[0] / zoomin / self.column_num)) + self.boxes = self.sort_X_by_page(self.boxes, column_width / self.column_num) + + i = 0 + while i + 1 < len(bxs): + b = bxs[i] + b_ = bxs[i + 1] + if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]): + bxs.pop(i) + continue + if not b["text"].strip(): + bxs.pop(i) + continue + concatting_feats = [ + b["text"].strip()[-1] in ",;:'\",、‘“;:-", + len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:", + b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:", + ] + # features for not concating + feats = [ + b.get("layoutno", 0) != b_.get("layoutno", 0), + b["text"].strip()[-1] in "。?!?", + self.is_english and b["text"].strip()[-1] in ".!?", + b["page_number"] == b_["page_number"] and b_["top"] - b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5, + b["page_number"] < b_["page_number"] and abs(b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4, + ] + # split features + detach_feats = [b["x1"] < b_["x0"], b["x0"] > b_["x1"]] + if (any(feats) and not any(concatting_feats)) or any(detach_feats): + logging.debug( + "{} {} {} {}".format( + b["text"], + b_["text"], + any(feats), + any(concatting_feats), + ) + ) + i += 1 + continue + # merge up and down + b["bottom"] = b_["bottom"] + b["text"] += b_["text"] + b["x0"] = min(b["x0"], b_["x0"]) + b["x1"] = max(b["x1"], b_["x1"]) + bxs.pop(i + 1) + self.boxes = bxs + + def _concat_downward(self, concat_between_pages=True): + self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0) + return + + # count boxes in the same row as a feature + for i in range(len(self.boxes)): + mh = self.mean_height[self.boxes[i]["page_number"] - 1] + self.boxes[i]["in_row"] = 0 + j = max(0, i - 12) + while j < min(i + 12, len(self.boxes)): + if j == i: + j += 1 + continue + ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh + if abs(ydis) < 1: + self.boxes[i]["in_row"] += 1 + elif ydis > 0: + 
break + j += 1 + + # concat between rows + boxes = deepcopy(self.boxes) + blocks = [] + while boxes: + chunks = [] + + def dfs(up, dp): + chunks.append(up) + i = dp + while i < min(dp + 12, len(boxes)): + ydis = self._y_dis(up, boxes[i]) + smpg = up["page_number"] == boxes[i]["page_number"] + mh = self.mean_height[up["page_number"] - 1] + mw = self.mean_width[up["page_number"] - 1] + if smpg and ydis > mh * 4: + break + if not smpg and ydis > mh * 16: + break + down = boxes[i] + if not concat_between_pages and down["page_number"] > up["page_number"]: + break + + if up.get("R", "") != down.get("R", "") and up["text"][-1] != ",": + i += 1 + continue + + if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) or not down["text"].strip(): + i += 1 + continue + + if not down["text"].strip() or not up["text"].strip(): + i += 1 + continue + + if up["x1"] < down["x0"] - 10 * mw or up["x0"] > down["x1"] + 10 * mw: + i += 1 + continue + + if i - dp < 5 and up.get("layout_type") == "text": + if up.get("layoutno", "1") == down.get("layoutno", "2"): + dfs(down, i + 1) + boxes.pop(i) + return + i += 1 + continue + + fea = self._updown_concat_features(up, down) + if self.updown_cnt_mdl.predict(xgb.DMatrix([fea]))[0] <= 0.5: + i += 1 + continue + dfs(down, i + 1) + boxes.pop(i) + return + + dfs(boxes[0], 1) + boxes.pop(0) + if chunks: + blocks.append(chunks) + + # concat within each block + boxes = [] + for b in blocks: + if len(b) == 1: + boxes.append(b[0]) + continue + t = b[0] + for c in b[1:]: + t["text"] = t["text"].strip() + c["text"] = c["text"].strip() + if not c["text"]: + continue + if t["text"] and re.match(r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]): + t["text"] += " " + t["text"] += c["text"] + t["x0"] = min(t["x0"], c["x0"]) + t["x1"] = max(t["x1"], c["x1"]) + t["page_number"] = min(t["page_number"], c["page_number"]) + t["bottom"] = c["bottom"] + if not t["layout_type"] and c["layout_type"]: + t["layout_type"] = 
c["layout_type"] + boxes.append(t) + + self.boxes = Recognizer.sort_Y_firstly(boxes, 0) + + def _filter_forpages(self): + if not self.boxes: + return + findit = False + i = 0 + while i < len(self.boxes): + if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): + i += 1 + continue + findit = True + eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip()) + self.boxes.pop(i) + if i >= len(self.boxes): + break + prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split()[:2]) + while not prefix: + self.boxes.pop(i) + if i >= len(self.boxes): + break + prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split()[:2]) + self.boxes.pop(i) + if i >= len(self.boxes) or not prefix: + break + for j in range(i, min(i + 128, len(self.boxes))): + if not re.match(prefix, self.boxes[j]["text"]): + continue + for k in range(i, j): + self.boxes.pop(i) + break + if findit: + return + + page_dirty = [0] * len(self.page_images) + for b in self.boxes: + if re.search(r"(··|··|··)", b["text"]): + page_dirty[b["page_number"] - 1] += 1 + page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3]) + if not page_dirty: + return + i = 0 + while i < len(self.boxes): + if self.boxes[i]["page_number"] in page_dirty: + self.boxes.pop(i) + continue + i += 1 + + def _merge_with_same_bullet(self): + i = 0 + while i + 1 < len(self.boxes): + b = self.boxes[i] + b_ = self.boxes[i + 1] + if not b["text"].strip(): + self.boxes.pop(i) + continue + if not b_["text"].strip(): + self.boxes.pop(i + 1) + continue + + if ( + b["text"].strip()[0] != b_["text"].strip()[0] + or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") + or rag_tokenizer.is_chinese(b["text"].strip()[0]) + or b["top"] > b_["bottom"] + ): + i += 1 + continue + b_["text"] = b["text"] + "\n" + b_["text"] + b_["x0"] = min(b["x0"], b_["x0"]) 
+ b_["x1"] = max(b["x1"], b_["x1"]) + b_["top"] = b["top"] + self.boxes.pop(i) + + def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False): + tables = {} + figures = {} + # extract figure and table boxes + i = 0 + lst_lout_no = "" + nomerge_lout_no = [] + while i < len(self.boxes): + if "layoutno" not in self.boxes[i]: + i += 1 + continue + lout_no = str(self.boxes[i]["page_number"]) + "-" + str(self.boxes[i]["layoutno"]) + if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", "figure caption", "reference"]: + nomerge_lout_no.append(lst_lout_no) + if self.boxes[i]["layout_type"] == "table": + if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): + self.boxes.pop(i) + continue + if lout_no not in tables: + tables[lout_no] = [] + tables[lout_no].append(self.boxes[i]) + self.boxes.pop(i) + lst_lout_no = lout_no + continue + if need_image and self.boxes[i]["layout_type"] == "figure": + if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): + self.boxes.pop(i) + continue + if lout_no not in figures: + figures[lout_no] = [] + figures[lout_no].append(self.boxes[i]) + self.boxes.pop(i) + lst_lout_no = lout_no + continue + i += 1 + + # merge table on different pages + nomerge_lout_no = set(nomerge_lout_no) + tbls = sorted([(k, bxs) for k, bxs in tables.items()], key=lambda x: (x[1][0]["top"], x[1][0]["x0"])) + + i = len(tbls) - 1 + while i - 1 >= 0: + k0, bxs0 = tbls[i - 1] + k, bxs = tbls[i] + i -= 1 + if k0 in nomerge_lout_no: + continue + if bxs[0]["page_number"] == bxs0[0]["page_number"]: + continue + if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1: + continue + mh = self.mean_height[bxs[0]["page_number"] - 1] + if self._y_dis(bxs0[-1], bxs[0]) > mh * 23: + continue + tables[k0].extend(tables[k]) + del tables[k] + + def x_overlapped(a, b): + return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]]) + + # find captions and pop out + i = 0 + while i 
< len(self.boxes): + c = self.boxes[i] + # mh = self.mean_height[c["page_number"]-1] + if not TableStructureRecognizer.is_caption(c): + i += 1 + continue + + # find the nearest layouts + def nearest(tbls): + nonlocal c + mink = "" + minv = 1000000000 + for k, bxs in tbls.items(): + for b in bxs: + if b.get("layout_type", "").find("caption") >= 0: + continue + y_dis = self._y_dis(c, b) + x_dis = self._x_dis(c, b) if not x_overlapped(c, b) else 0 + dis = y_dis * y_dis + x_dis * x_dis + if dis < minv: + mink = k + minv = dis + return mink, minv + + tk, tv = nearest(tables) + fk, fv = nearest(figures) + # if min(tv, fv) > 2000: + # i += 1 + # continue + if tv < fv and tk: + tables[tk].insert(0, c) + logging.debug("TABLE:" + self.boxes[i]["text"] + "; Cap: " + tk) + elif fk: + figures[fk].insert(0, c) + logging.debug("FIGURE:" + self.boxes[i]["text"] + "; Cap: " + tk) + self.boxes.pop(i) + + def cropout(bxs, ltype, poss): + nonlocal ZM + pn = set([b["page_number"] - 1 for b in bxs]) + if len(pn) < 2: + pn = list(pn)[0] + ht = self.page_cum_height[pn] + b = {"x0": np.min([b["x0"] for b in bxs]), "top": np.min([b["top"] for b in bxs]) - ht, "x1": np.max([b["x1"] for b in bxs]), "bottom": np.max([b["bottom"] for b in bxs]) - ht} + louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype] + ii = Recognizer.find_overlapped(b, louts, naive=True) + if ii is not None: + b = louts[ii] + else: + logging.warning(f"Missing layout match: {pn + 1},%s" % (bxs[0].get("layoutno", ""))) + + left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"] + if right < left: + right = left + 1 + poss.append((pn + self.page_from, left, right, top, bott)) + return self.page_images[pn].crop((left * ZM, top * ZM, right * ZM, bott * ZM)) + pn = {} + for b in bxs: + p = b["page_number"] - 1 + if p not in pn: + pn[p] = [] + pn[p].append(b) + pn = sorted(pn.items(), key=lambda x: x[0]) + imgs = [cropout(arr, ltype, poss) for p, arr in pn] + pic = Image.new("RGB", 
def proj_match(self, line):
    """Classify ``line`` as a heading / list-item style and return its rule number.

    Args:
        line: The text of one line.

    Returns:
        ``None`` when the line has two characters or fewer, or when no rule
        matches; ``False`` when the line consists only of digits, spaces and
        the punctuation ``().,%+/-``; otherwise the integer paired with the
        first rule (in the order listed below) that matches at the start of
        the line.
    """
    if len(line) <= 2:
        return
    if re.match(r"[0-9 ().,%%+/-]+$", line):
        return False
    rules = (
        (r"第[零一二三四五六七八九十百]+章", 1),
        (r"第[零一二三四五六七八九十百]+[条节]", 2),
        (r"[零一二三四五六七八九十百]+[、  ]", 3),
        (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
        (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
        (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
        (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
        (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
        (r".{,48}[::??]$", 9),
        # The closing parenthesis must sit inside a character class: a bare
        # ")" is an unbalanced group and makes the pattern uncompilable.
        # Both the ASCII and the full-width (U+FF09) forms are accepted.
        (r"[0-9]+[)\uff09]", 10),
        (r"[\((][0-9]+[)\)]", 11),
        (r"[零一二三四五六七八九十百]+是", 12),
        (r"[⚫•➢✓]", 12),
    )
    for pattern, level in rules:
        if re.match(pattern, line):
            return level
    return
@staticmethod
def total_page_number(fnm, binary=None):
    """Return the number of pages in a PDF.

    Args:
        fnm: Path handed to ``pdfplumber.open`` when ``binary`` is falsy.
        binary: Optional PDF content as bytes; when truthy it is wrapped in
            ``BytesIO`` and opened instead of ``fnm``.

    Returns:
        The page count, or ``None`` when opening or reading the PDF raised
        (the exception is logged, not propagated).
    """
    try:
        # pdfplumber access is serialised through the module-wide lock.
        with sys.modules[LOCK_KEY_pdfplumber]:
            # The context manager closes the document even if counting the
            # pages raises, which the explicit close() call did not.
            with pdfplumber.open(BytesIO(binary) if binary else fnm) as pdf:
                return len(pdf.pages)
    except Exception:
        logging.exception("total_page_number")
    return None
+ + self.total_page = len(self.pdf.pages) + + except Exception: + logging.exception("RAGFlowPdfParser __images__") + logging.info(f"__images__ dedupe_chars cost {timer() - start}s") + + self.outlines = [] + try: + with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf: + self.pdf = pdf + + outlines = self.pdf.outline + + def dfs(arr, depth): + for a in arr: + if isinstance(a, dict): + self.outlines.append((a["/Title"], depth)) + continue + dfs(a, depth + 1) + + dfs(outlines, 0) + + except Exception as e: + logging.warning(f"Outlines exception: {e}") + + if not self.outlines: + logging.warning("Miss outlines") + + logging.debug("Images converted.") + self.is_english = [ + re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) + for i in range(len(self.page_chars)) + ] + if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: + self.is_english = True + else: + self.is_english = False + + async def __img_ocr(i, id, img, chars, limiter): + j = 0 + while j + 1 < len(chars): + if ( + chars[j]["text"] + and chars[j + 1]["text"] + and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) + and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"], chars[j]["width"]) / 2 + ): + chars[j]["text"] += " " + j += 1 + + if limiter: + async with limiter: + await trio.to_thread.run_sync(lambda: self.__ocr(i + 1, img, chars, zoomin, id)) + else: + self.__ocr(i + 1, img, chars, zoomin, id) + + if callback and i % 6 == 5: + callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") + + async def __img_ocr_launcher(): + def __ocr_preprocess(): + chars = self.page_chars[i] if not self.is_english else [] + self.mean_height.append(np.median(sorted([c["height"] for c in chars])) if chars else 0) + self.mean_width.append(np.median(sorted([c["width"] for c in chars])) if chars else 8) + 
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
    """Run the whole parsing pipeline on one PDF.

    Args:
        fnm: PDF source, forwarded to ``__images__``.
        need_image: Forwarded to ``_extract_table_figure``.
        zoomin: Zoom factor forwarded to every stage that accepts one.
        return_html: Forwarded to ``_extract_table_figure``.

    Returns:
        A 2-tuple: the result of ``__filterout_scraps`` applied to a deep
        copy of ``self.boxes``, and the value returned by
        ``_extract_table_figure`` (called with ``need_position=False``).
    """
    self.__images__(fnm, zoomin)

    # Stages that take the zoom factor, in order.
    for stage_name in ("_layouts_rec", "_table_transformer_job"):
        getattr(self, stage_name)(zoomin)

    # Stages that take no arguments, in order.
    for stage_name in ("_text_merge", "_concat_downward", "_filter_forpages"):
        getattr(self, stage_name)()

    tables = self._extract_table_figure(need_image, zoomin, return_html, False)
    # Work on a copy so the scrap filter cannot disturb self.boxes.
    remaining = deepcopy(self.boxes)
    text = self.__filterout_scraps(remaining, zoomin)
    return text, tables
"Table analysis ({:.2f}s)".format(timer() - start)) + + start = timer() + self._text_merge() + self._concat_downward() + self._naive_vertical_merge(zoomin) + if callback: + callback(0.92, "Text merged ({:.2f}s)".format(timer() - start)) + + start = timer() + tbls, figs = self._extract_table_figure(True, zoomin, True, True, True) + + def insert_table_figures(tbls_or_figs, layout_type): + def min_rectangle_distance(rect1, rect2): + import math + pn1, left1, right1, top1, bottom1 = rect1 + pn2, left2, right2, top2, bottom2 = rect2 + if right1 >= left2 and right2 >= left1 and bottom1 >= top2 and bottom2 >= top1: + return 0 + if right1 < left2: + dx = left2 - right1 + elif right2 < left1: + dx = left1 - right2 + else: + dx = 0 + if bottom1 < top2: + dy = top2 - bottom1 + elif bottom2 < top1: + dy = top1 - bottom2 + else: + dy = 0 + return math.sqrt(dx*dx + dy*dy)# + (pn2-pn1)*10000 + + for (img, txt), poss in tbls_or_figs: + bboxes = [(i, (b["page_number"], b["x0"], b["x1"], b["top"], b["bottom"])) for i, b in enumerate(self.boxes)] + dists = [(min_rectangle_distance((pn, left, right, top+self.page_cum_height[pn], bott+self.page_cum_height[pn]), rect),i) for i, rect in bboxes for pn, left, right, top, bott in poss] + min_i = np.argmin(dists, axis=0)[0] + min_i, rect = bboxes[dists[min_i][-1]] + if isinstance(txt, list): + txt = "\n".join(txt) + pn, left, right, top, bott = poss[0] + if self.boxes[min_i]["bottom"] < top+self.page_cum_height[pn]: + min_i += 1 + self.boxes.insert(min_i, { + "page_number": pn+1, "x0": left, "x1": right, "top": top+self.page_cum_height[pn], "bottom": bott+self.page_cum_height[pn], "layout_type": layout_type, "text": txt, "image": img, + "positions": [[pn+1, int(left), int(right), int(top), int(bott)]] + }) + + for b in self.boxes: + b["position_tag"] = self._line_tag(b, zoomin) + b["image"] = self.crop(b["position_tag"], zoomin) + b["positions"] = [[pos[0][-1]+1, *pos[1:]] for pos in SimplePdfParser.extract_positions(b["position_tag"])] + + 
@staticmethod
def extract_positions(txt):
    """Parse every ``@@pages\\tleft\\tright\\ttop\\tbottom##`` tag found in ``txt``.

    ``pages`` is one or more 1-based page numbers joined by ``-``.

    Args:
        txt: Text that may contain position tags.

    Returns:
        A list with one tuple per well-formed tag, in order of appearance:
        ``([zero_based_page, ...], left, right, top, bottom)`` with the four
        coordinates as floats.  Candidates that match the tag pattern but do
        not hold exactly five tab-separated fields, or whose fields are not
        valid numbers, are skipped.
    """
    poss = []
    for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
        fields = tag.strip("#").strip("@").split("\t")
        # The pattern accepts any number of tab-separated numeric fields, so
        # the shape has to be checked before unpacking.
        if len(fields) != 5:
            continue
        try:
            pages = [int(p) - 1 for p in fields[0].split("-")]
            left, right, top, bottom = (float(v) for v in fields[1:])
        except ValueError:
            # e.g. "1--2" as page list or "1.2.3" as coordinate
            continue
        poss.append((pages, left, right, top, bottom))
    return poss
class VisionParser(SimplePdfParser):
    """PDF parser that renders pages to images and has a vision model describe them."""

    def __init__(self, vision_model, *args, **kwargs):
        """Initialise the base parser and remember ``vision_model``.

        Args:
            vision_model: Model object passed to ``picture_vision_llm_chunk``
                for every page.
            *args, **kwargs: Forwarded unchanged to the base class.
        """
        super().__init__(*args, **kwargs)
        self.vision_model = vision_model

    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
        """Render pages ``pages[page_from:page_to]`` of the PDF into ``self.page_images``.

        Args:
            fnm: File path (``str``) or the PDF content as bytes.
            zoomin: Rendering resolution is ``72 * zoomin``.
            page_from: Start of the page slice (0-based).
            page_to: End of the page slice (exclusive).
            callback: Accepted for signature compatibility; not used here.

        On success ``self.page_images`` holds one image per rendered page and
        ``self.total_page`` the page count of the whole document.  On any
        exception the error is logged, ``self.page_images`` is set to ``None``
        and ``self.total_page`` to ``0``.
        """
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                # Context manager so the document is closed once the pages are
                # rendered, instead of leaving the handle open.
                with pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) as pdf:
                    self.pdf = pdf
                    self.page_images = [p.to_image(resolution=72 * zoomin).annotated for p in pdf.pages[page_from:page_to]]
                    self.total_page = len(pdf.pages)
        except Exception:
            self.page_images = None
            self.total_page = 0
            logging.exception("VisionParser __images__")

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        """Describe every page in ``[from_page, to_page)`` with the vision model.

        Args:
            filename: File path or PDF bytes, forwarded to ``__images__``.
            from_page: First page to process (0-based); negative values are
                treated as 0.
            to_page: End of the page range (exclusive).
            **kwargs: ``callback(prog, msg)`` for progress reporting and
                ``zoomin`` (default 3) for the rendering zoom.

        Returns:
            ``(docs, [])`` where ``docs`` holds one
            ``(text, "<page> 0 <width> 0 <height>")`` tuple per page for which
            the model returned non-empty text.  ``<page>`` is the 1-based page
            number within the whole PDF; width and height are the rendered
            image size divided by ``zoomin``.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)
        zoomin = kwargs.get("zoomin", 3)
        start_page = max(0, from_page)
        self.__images__(fnm=filename, zoomin=zoomin, page_from=start_page, page_to=to_page, callback=callback)

        page_images = self.page_images or []
        all_docs = []

        for idx, img_binary in enumerate(page_images):
            # page_images is already the slice [start_page:to_page], so the
            # absolute page index is the slice offset plus the position in the
            # slice.  Comparing the bare position with the absolute bounds
            # would skip every page whenever from_page > 0.
            pdf_page_num = start_page + idx  # 0-based

            text = picture_vision_llm_chunk(
                binary=img_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_describe_prompt(page=pdf_page_num + 1),
                callback=callback,
            )
            if kwargs.get("callback"):
                kwargs["callback"](idx * 1.0 / len(page_images), f"Processed: {idx + 1}/{len(page_images)}")

            if text:
                width, height = img_binary.size
                all_docs.append((text, f"{pdf_page_num + 1} 0 {width / zoomin} 0 {height / zoomin}"))
        return all_docs, []


# Backward-compatible alias
PdfParser = SimplePdfParser