优化OCR解析

This commit is contained in:
2025-11-03 10:22:28 +08:00
parent 4603a86df4
commit 3e58c3d0e9
9 changed files with 581 additions and 30 deletions

View File

@@ -29,7 +29,8 @@ from api.db.services.llm_service import LLMBundle
from api.utils import get_uuid
from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from ocr.service import get_ocr_service
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
@@ -204,7 +205,9 @@ class Parser(ProcessBase):
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method").lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
# 注意:HTTP 调用中无法传递 callback,callback 将被忽略
ocr_service = get_ocr_service()
bboxes = ocr_service.parse_into_bboxes_sync(blob, callback=self.callback, filename=name)
elif conf.get("parse_method").lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]