优化OCR解析
This commit is contained in:
@@ -19,7 +19,7 @@ import trio
 
 from api.utils import get_uuid
 from api.utils.base64_image import id2image, image2id
-from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from ocr.service import get_ocr_service
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.splitter.schema import SplitterFromUpstream
 from rag.nlp import naive_merge, naive_merge_with_images
@@ -96,14 +96,18 @@ class Splitter(ProcessBase):
             deli,
             self._param.overlapped_percent,
         )
-        cks = [
-            {
-                "text": RAGFlowPdfParser.remove_tag(c),
+        ocr_service = get_ocr_service()
+        cks = []
+        for c, img in zip(chunks, images):
+            if not c.strip():
+                continue
+            cleaned_text = await ocr_service.remove_tag(c)
+            positions = await ocr_service.extract_positions(c)
+            cks.append({
+                "text": cleaned_text,
                 "image": img,
-                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
-            }
-            for c, img in zip(chunks, images) if c.strip()
-        ]
+                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in positions],
+            })
         async with trio.open_nursery() as nursery:
             for d in cks:
                 nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
Reference in New Issue
Block a user