优化OCR解析

This commit is contained in:
2025-11-03 10:22:28 +08:00
parent 4603a86df4
commit 3e58c3d0e9
9 changed files with 581 additions and 30 deletions

View File

@@ -19,7 +19,7 @@ import trio
from api.utils import get_uuid
from api.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from ocr.service import get_ocr_service
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.splitter.schema import SplitterFromUpstream
from rag.nlp import naive_merge, naive_merge_with_images
@@ -96,14 +96,18 @@ class Splitter(ProcessBase):
deli,
self._param.overlapped_percent,
)
cks = [
{
"text": RAGFlowPdfParser.remove_tag(c),
ocr_service = get_ocr_service()
cks = []
for c, img in zip(chunks, images):
if not c.strip():
continue
cleaned_text = await ocr_service.remove_tag(c)
positions = await ocr_service.extract_positions(c)
cks.append({
"text": cleaned_text,
"image": img,
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
}
for c, img in zip(chunks, images) if c.strip()
]
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in positions],
})
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())