优化OCR解析
This commit is contained in:
@@ -22,7 +22,7 @@ import trio
|
||||
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import id2image, image2id
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from ocr.service import get_ocr_service
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
|
||||
from rag.nlp import concat_img
|
||||
@@ -170,14 +170,17 @@ class HierarchicalMerger(ProcessBase):
|
||||
cks.append(txt)
|
||||
images.append(img)
|
||||
|
||||
cks = [
|
||||
{
|
||||
"text": RAGFlowPdfParser.remove_tag(c),
|
||||
ocr_service = get_ocr_service()
|
||||
processed_cks = []
|
||||
for c, img in zip(cks, images):
|
||||
cleaned_text = await ocr_service.remove_tag(c)
|
||||
positions = await ocr_service.extract_positions(c)
|
||||
processed_cks.append({
|
||||
"text": cleaned_text,
|
||||
"image": img,
|
||||
"positions": RAGFlowPdfParser.extract_positions(c),
|
||||
}
|
||||
for c, img in zip(cks, images)
|
||||
]
|
||||
"positions": positions,
|
||||
})
|
||||
cks = processed_cks
|
||||
async with trio.open_nursery() as nursery:
|
||||
for d in cks:
|
||||
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
|
||||
|
||||
@@ -29,7 +29,8 @@ from api.db.services.llm_service import LLMBundle
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import image2id
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from ocr.service import get_ocr_service
|
||||
from rag.app.naive import Docx
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.parser.schema import ParserFromUpstream
|
||||
@@ -204,7 +205,9 @@ class Parser(ProcessBase):
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
if conf.get("parse_method").lower() == "deepdoc":
|
||||
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
|
||||
# 注意:HTTP 调用中无法传递 callback,callback 将被忽略
|
||||
ocr_service = get_ocr_service()
|
||||
bboxes = ocr_service.parse_into_bboxes_sync(blob, callback=self.callback, filename=name)
|
||||
elif conf.get("parse_method").lower() == "plain_text":
|
||||
lines, _ = PlainParser()(blob)
|
||||
bboxes = [{"text": t} for t, _ in lines]
|
||||
|
||||
@@ -19,7 +19,7 @@ import trio
|
||||
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import id2image, image2id
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from ocr.service import get_ocr_service
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.splitter.schema import SplitterFromUpstream
|
||||
from rag.nlp import naive_merge, naive_merge_with_images
|
||||
@@ -96,14 +96,18 @@ class Splitter(ProcessBase):
|
||||
deli,
|
||||
self._param.overlapped_percent,
|
||||
)
|
||||
cks = [
|
||||
{
|
||||
"text": RAGFlowPdfParser.remove_tag(c),
|
||||
ocr_service = get_ocr_service()
|
||||
cks = []
|
||||
for c, img in zip(chunks, images):
|
||||
if not c.strip():
|
||||
continue
|
||||
cleaned_text = await ocr_service.remove_tag(c)
|
||||
positions = await ocr_service.extract_positions(c)
|
||||
cks.append({
|
||||
"text": cleaned_text,
|
||||
"image": img,
|
||||
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
|
||||
}
|
||||
for c, img in zip(chunks, images) if c.strip()
|
||||
]
|
||||
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in positions],
|
||||
})
|
||||
async with trio.open_nursery() as nursery:
|
||||
for d in cks:
|
||||
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
|
||||
|
||||
@@ -578,7 +578,8 @@ def hierarchical_merge(bull, sections, depth):
|
||||
|
||||
|
||||
def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from ocr.service import get_ocr_service
|
||||
ocr_service = get_ocr_service()
|
||||
if not sections:
|
||||
return []
|
||||
if isinstance(sections, str):
|
||||
@@ -598,7 +599,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
|
||||
# Ensure that the length of the merged chunk does not exceed chunk_token_num
|
||||
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
|
||||
if cks:
|
||||
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
|
||||
overlapped = ocr_service.remove_tag_sync(cks[-1])
|
||||
t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
|
||||
if t.find(pos) < 0:
|
||||
t += pos
|
||||
@@ -625,7 +626,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
|
||||
|
||||
|
||||
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from ocr.service import get_ocr_service
|
||||
ocr_service = get_ocr_service()
|
||||
if not texts or len(texts) != len(images):
|
||||
return [], []
|
||||
cks = [""]
|
||||
@@ -642,7 +644,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
# Ensure that the length of the merged chunk does not exceed chunk_token_num
|
||||
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
|
||||
if cks:
|
||||
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
|
||||
overlapped = ocr_service.remove_tag_sync(cks[-1])
|
||||
t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
|
||||
if t.find(pos) < 0:
|
||||
t += pos
|
||||
|
||||
Reference in New Issue
Block a user