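优化OCR解析 (Optimize OCR parsing: HierarchicalMerger now obtains an OCR service via get_ocr_service() and awaits its remove_tag / extract_positions for each chunk, replacing the direct RAGFlowPdfParser.remove_tag / RAGFlowPdfParser.extract_positions calls)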

This commit is contained in:
2025-11-03 10:22:28 +08:00
parent 4603a86df4
commit 3e58c3d0e9
9 changed files with 581 additions and 30 deletions

View File

@@ -22,7 +22,7 @@ import trio
from api.utils import get_uuid
from api.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from ocr.service import get_ocr_service
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
from rag.nlp import concat_img
@@ -170,14 +170,17 @@ class HierarchicalMerger(ProcessBase):
cks.append(txt)
images.append(img)
cks = [
{
"text": RAGFlowPdfParser.remove_tag(c),
ocr_service = get_ocr_service()
processed_cks = []
for c, img in zip(cks, images):
cleaned_text = await ocr_service.remove_tag(c)
positions = await ocr_service.extract_positions(c)
processed_cks.append({
"text": cleaned_text,
"image": img,
"positions": RAGFlowPdfParser.extract_positions(c),
}
for c, img in zip(cks, images)
]
"positions": positions,
})
cks = processed_cks
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())