优化OCR解析

This commit is contained in:
2025-11-03 10:22:28 +08:00
parent 4603a86df4
commit 3e58c3d0e9
9 changed files with 581 additions and 30 deletions

View File

@@ -22,7 +22,7 @@ import trio
from api.utils import get_uuid
from api.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from ocr.service import get_ocr_service
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
from rag.nlp import concat_img
@@ -170,14 +170,17 @@ class HierarchicalMerger(ProcessBase):
cks.append(txt)
images.append(img)
cks = [
{
"text": RAGFlowPdfParser.remove_tag(c),
ocr_service = get_ocr_service()
processed_cks = []
for c, img in zip(cks, images):
cleaned_text = await ocr_service.remove_tag(c)
positions = await ocr_service.extract_positions(c)
processed_cks.append({
"text": cleaned_text,
"image": img,
"positions": RAGFlowPdfParser.extract_positions(c),
}
for c, img in zip(cks, images)
]
"positions": positions,
})
cks = processed_cks
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())

View File

@@ -29,7 +29,8 @@ from api.db.services.llm_service import LLMBundle
from api.utils import get_uuid
from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from ocr.service import get_ocr_service
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
@@ -204,7 +205,9 @@ class Parser(ProcessBase):
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method").lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
# 注意HTTP 调用中无法传递 callbackcallback 将被忽略
ocr_service = get_ocr_service()
bboxes = ocr_service.parse_into_bboxes_sync(blob, callback=self.callback, filename=name)
elif conf.get("parse_method").lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]

View File

@@ -19,7 +19,7 @@ import trio
from api.utils import get_uuid
from api.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from ocr.service import get_ocr_service
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.splitter.schema import SplitterFromUpstream
from rag.nlp import naive_merge, naive_merge_with_images
@@ -96,14 +96,18 @@ class Splitter(ProcessBase):
deli,
self._param.overlapped_percent,
)
cks = [
{
"text": RAGFlowPdfParser.remove_tag(c),
ocr_service = get_ocr_service()
cks = []
for c, img in zip(chunks, images):
if not c.strip():
continue
cleaned_text = await ocr_service.remove_tag(c)
positions = await ocr_service.extract_positions(c)
cks.append({
"text": cleaned_text,
"image": img,
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
}
for c, img in zip(chunks, images) if c.strip()
]
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in positions],
})
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())