v0.21.1-fastapi
@@ -22,7 +22,7 @@ import trio
 
 from api.utils import get_uuid
 from api.utils.base64_image import id2image, image2id
-from ocr.service import get_ocr_service
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
 from rag.nlp import concat_img
@@ -166,24 +166,21 @@ class HierarchicalMerger(ProcessBase):
             img = None
             for i in path:
                 txt += lines[i] + "\n"
-                concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
+                concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))
             cks.append(txt)
             images.append(img)

-        ocr_service = get_ocr_service()
-        processed_cks = []
-        for c, img in zip(cks, images):
-            cleaned_text = await ocr_service.remove_tag(c)
-            positions = await ocr_service.extract_positions(c)
-            processed_cks.append({
-                "text": cleaned_text,
+        cks = [
+            {
+                "text": RAGFlowPdfParser.remove_tag(c),
                 "image": img,
-                "positions": positions,
-            })
-        cks = processed_cks
+                "positions": RAGFlowPdfParser.extract_positions(c),
+            }
+            for c, img in zip(cks, images)
+        ]
         async with trio.open_nursery() as nursery:
             for d in cks:
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
         self.set_output("chunks", cks)

         self.callback(1, "Done.")
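The HierarchicalMerger hunk above (and the Splitter hunk further down) drops the async OCR-service calls in favour of the same pattern: strip the inline position tags out of each merged text, keep the extracted positions, and pair the result with its merged image. A minimal sketch of that pattern, using a hypothetical tag format and local helpers rather than RAGFlowPdfParser's actual implementation:

import re

# Hypothetical inline tag: "@@page\tx0\tx1\ttop\tbottom##" appended to the text.
TAG_RE = re.compile(r"@@([0-9.\t]+)##")

def remove_tag(text: str) -> str:
    # Drop every position tag, leaving only the readable text.
    return TAG_RE.sub("", text).strip()

def extract_positions(text: str) -> list[list[float]]:
    # One [page, x0, x1, top, bottom] entry per tag, in order of appearance.
    return [[float(v) for v in m.group(1).split("\t")] for m in TAG_RE.finditer(text)]

def build_chunks(texts: list[str], images: list) -> list[dict]:
    # Same shape as the list comprehension in the diff: one dict per non-empty text.
    return [
        {"text": remove_tag(t), "image": img, "positions": extract_positions(t)}
        for t, img in zip(texts, images)
        if t.strip()
    ]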
@@ -29,8 +29,8 @@ from api.db.services.llm_service import LLMBundle
 from api.utils import get_uuid
 from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
-from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from ocr.service import get_ocr_service
+from deepdoc.parser.mineru_parser import MinerUParser
+from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
 from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
@@ -53,6 +53,7 @@ class ParserParam(ProcessParamBase):
             ],
             "word": [
                 "json",
+                "markdown",
             ],
             "slides": [
                 "json",
@@ -138,9 +139,16 @@ class ParserParam(ProcessParamBase):
                     "oggvorbis",
                     "ape"
                 ],
-                "output_format": "json",
+                "output_format": "text",
             },
+            "video": {
+                "suffix":[
+                    "mp4",
+                    "avi",
+                    "mkv"
+                ],
+                "output_format": "text",
+            },
-            "video": {},
         }

     def check(self):
@@ -149,7 +157,7 @@ class ParserParam(ProcessParamBase):
         pdf_parse_method = pdf_config.get("parse_method", "")
         self.check_empty(pdf_parse_method, "Parse method abnormal.")

-        if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
+        if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
             self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")

         pdf_output_format = pdf_config.get("output_format", "")
@@ -184,8 +192,10 @@ class ParserParam(ProcessParamBase):
         audio_config = self.setups.get("audio", "")
         if audio_config:
             self.check_empty(audio_config.get("llm_id"), "Audio VLM")
-            audio_language = audio_config.get("lang", "")
-            self.check_empty(audio_language, "Language")

+        video_config = self.setups.get("video", "")
+        if video_config:
+            self.check_empty(video_config.get("llm_id"), "Video VLM")
+
         email_config = self.setups.get("email", "")
         if email_config:
@@ -205,19 +215,38 @@ class Parser(ProcessBase):
         self.set_output("output_format", conf["output_format"])

         if conf.get("parse_method").lower() == "deepdoc":
-            # Note: callback cannot be passed through the HTTP call, so it is ignored here
-            ocr_service = get_ocr_service()
-            bboxes = ocr_service.parse_into_bboxes_sync(blob, callback=self.callback, filename=name)
+            bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
         elif conf.get("parse_method").lower() == "plain_text":
             lines, _ = PlainParser()(blob)
             bboxes = [{"text": t} for t, _ in lines]
+        elif conf.get("parse_method").lower() == "mineru":
+            mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
+            pdf_parser = MinerUParser(mineru_path=mineru_executable)
+            if not pdf_parser.check_installation():
+                raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
+
+            lines, _ = pdf_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+            )
+            bboxes = []
+            for t, poss in lines:
+                box = {
+                    "image": pdf_parser.crop(poss, 1),
+                    "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)],
+                    "text": t,
+                }
+                bboxes.append(box)
         else:
             vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
             lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
             bboxes = []
             for t, poss in lines:
-                pn, x0, x1, top, bott = poss.split(" ")
-                bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+                for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
+                    bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})

         if conf.get("output_format") == "json":
             self.set_output("json", bboxes)
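The new "mineru" branch resolves the external executable and its output directory from environment variables before parsing. A self-contained sketch of that configuration step, assuming the same variable names; shutil.which stands in here for MinerUParser.check_installation, which is project-specific:

import os
import shutil

def resolve_mineru_config() -> dict:
    executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
    if shutil.which(executable) is None:
        raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
    return {
        "executable": executable,
        "output_dir": os.environ.get("MINERU_OUTPUT_DIR", ""),
        # "1"/"0" style environment flags become booleans.
        "delete_output": bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
    }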
@@ -250,13 +279,15 @@ class Parser(ProcessBase):
         conf = self._param.setups["word"]
         self.set_output("output_format", conf["output_format"])
         docx_parser = Docx()
-        sections, tbls = docx_parser(name, binary=blob)
-        sections = [{"text": section[0], "image": section[1]} for section in sections if section]
-        sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
-        # json
-        assert conf.get("output_format") == "json", "have to be json for doc"
+
         if conf.get("output_format") == "json":
+            sections, tbls = docx_parser(name, binary=blob)
+            sections = [{"text": section[0], "image": section[1]} for section in sections if section]
+            sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
             self.set_output("json", sections)
+        elif conf.get("output_format") == "markdown":
+            markdown_text = docx_parser.to_markdown(name, binary=blob)
+            self.set_output("markdown", markdown_text)

     def _slides(self, name, blob):
         from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
@@ -348,24 +379,34 @@ class Parser(ProcessBase):
 
         conf = self._param.setups["audio"]
         self.set_output("output_format", conf["output_format"])

-        lang = conf["lang"]
         _, ext = os.path.splitext(name)
         with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
             tmpf.write(blob)
             tmpf.flush()
             tmp_path = os.path.abspath(tmpf.name)

-            seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
+            seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT)
             txt = seq2txt_mdl.transcription(tmp_path)

             self.set_output("text", txt)

+    def _video(self, name, blob):
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.")
+
+        conf = self._param.setups["video"]
+        self.set_output("output_format", conf["output_format"])
+
+        cv_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT)
+        txt = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name)
+
+        self.set_output("text", txt)
+
     def _email(self, name, blob):
         self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")

         email_content = {}
         conf = self._param.setups["email"]
         self.set_output("output_format", conf["output_format"])
         target_fields = conf["fields"]

         _, ext = os.path.splitext(name)
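_audio keeps its temporary-file idiom: the uploaded bytes are written to a NamedTemporaryFile carrying the original extension so a path-based transcription API can read them. A sketch of that idiom with a stand-in transcribe callable (LLMBundle is RAGFlow-specific):

import os
import tempfile

def transcribe_blob(name: str, blob: bytes, transcribe) -> str:
    # Keep the original extension so the backend can sniff the container format.
    _, ext = os.path.splitext(name)
    with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
        tmpf.write(blob)
        tmpf.flush()  # ensure bytes are on disk before handing the path over
        return transcribe(os.path.abspath(tmpf.name))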
@@ -403,8 +444,8 @@ class Parser(ProcessBase):
 
                 _add_content(msg, msg.get_content_type())

-            email_content["text"] = body_text
-            email_content["text_html"] = body_html
+            email_content["text"] = "\n".join(body_text)
+            email_content["text_html"] = "\n".join(body_html)
             # get attachment
             if "attachments" in target_fields:
                 attachments = []
@@ -414,7 +455,7 @@ class Parser(ProcessBase):
                     dispositions = content_disposition.strip().split(";")
                     if dispositions[0].lower() == "attachment":
                         filename = part.get_filename()
-                        payload = part.get_payload(decode=True)
+                        payload = part.get_payload(decode=True).decode(part.get_content_charset())
                         attachments.append({
                             "filename": filename,
                             "payload": payload,
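The changed line decodes each attachment payload with the part's declared charset instead of keeping raw bytes. A standalone version of the surrounding loop using only the standard library; the fallback to utf-8 when no charset is declared is an assumption, not something the diff shows:

from email import message_from_bytes

def extract_attachments(raw: bytes) -> list[dict]:
    msg = message_from_bytes(raw)
    attachments = []
    for part in msg.walk():
        content_disposition = part.get("Content-Disposition", "")
        dispositions = content_disposition.strip().split(";")
        if dispositions[0].lower() == "attachment":
            payload = part.get_payload(decode=True)  # bytes after transfer-encoding is undone
            charset = part.get_content_charset() or "utf-8"  # assumed fallback
            attachments.append({
                "filename": part.get_filename(),
                "payload": payload.decode(charset, errors="replace") if payload else "",
            })
    return attachments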
@@ -442,15 +483,16 @@ class Parser(ProcessBase):
             }
             # get body
             if "body" in target_fields:
-                email_content["text"] = msg.body # usually empty. try text_html instead
-                email_content["text_html"] = msg.htmlBody
+                email_content["text"] = msg.body[0] if isinstance(msg.body, list) and msg.body else msg.body
+                if not email_content["text"] and msg.htmlBody:
+                    email_content["text"] = msg.htmlBody[0] if isinstance(msg.htmlBody, list) and msg.htmlBody else msg.htmlBody
             # get attachments
             if "attachments" in target_fields:
                 attachments = []
                 for t in msg.attachments:
                     attachments.append({
                         "filename": t.name,
-                        "payload": t.data # binary
+                        "payload": t.data.decode("utf-8")
                     })
                 email_content["attachments"] = attachments
@@ -485,6 +527,7 @@ class Parser(ProcessBase):
             "word": self._word,
             "image": self._image,
             "audio": self._audio,
+            "video": self._video,
             "email": self._email,
         }
         try:
@@ -514,4 +557,4 @@ class Parser(ProcessBase):
         outs = self.output()
         async with trio.open_nursery() as nursery:
             for d in outs.get("json", []):
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
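Every storage call in this commit now receives tenant_id through functools.partial before being handed to the trio nursery, so the upload helper itself stays tenant-agnostic. A runnable sketch of that fan-out pattern; upload_image and put_object are illustrative stand-ins for image2id and STORAGE_IMPL.put:

from functools import partial

import trio

async def put_object(tenant_id: str, object_id: str, data: bytes) -> None:
    await trio.sleep(0)  # stand-in for a real storage backend call

async def upload_image(doc: dict, put, object_id: str) -> None:
    # Store the image under object_id and keep only a reference in the chunk.
    await put(object_id, doc.pop("image", b""))
    doc["img_id"] = object_id

async def main() -> None:
    chunks = [{"text": "a", "image": b"..."}, {"text": "b", "image": b"..."}]
    async with trio.open_nursery() as nursery:
        for i, d in enumerate(chunks):
            # partial() bakes the tenant into the storage callable, as the diff does.
            nursery.start_soon(upload_image, d, partial(put_object, "tenant-1"), f"obj-{i}")
    print(chunks)

trio.run(main)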
@@ -25,7 +25,7 @@ class SplitterFromUpstream(BaseModel):
     file: dict | None = Field(default=None)
     chunks: list[dict[str, Any]] | None = Field(default=None)

-    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
+    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)

     json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
     markdown_result: str | None = Field(default=None, alias="markdown")
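The schema change only widens the Literal with a "chunks" option; the aliased fields let the upstream payload keep its literal "json" and "markdown" keys. A compact sketch of that pattern against pydantic v2 (the model name and config below are assumptions):

from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field

class UpstreamPayload(BaseModel):
    # Accept either the alias ("json") or the attribute name when validating.
    model_config = ConfigDict(populate_by_name=True)

    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")

payload = UpstreamPayload.model_validate({"output_format": "chunks", "json": [{"text": "hi"}]})
print(payload.json_result)  # [{'text': 'hi'}]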
@@ -19,7 +19,7 @@ import trio
 
 from api.utils import get_uuid
 from api.utils.base64_image import id2image, image2id
-from ocr.service import get_ocr_service
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.splitter.schema import SplitterFromUpstream
 from rag.nlp import naive_merge, naive_merge_with_images
@@ -87,7 +87,7 @@ class Splitter(ProcessBase):
         sections, section_images = [], []
         for o in from_upstream.json_result or []:
             sections.append((o.get("text", ""), o.get("position_tag", "")))
-            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))
+            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))

         chunks, images = naive_merge_with_images(
             sections,
@@ -96,20 +96,16 @@ class Splitter(ProcessBase):
             deli,
             self._param.overlapped_percent,
         )
-        ocr_service = get_ocr_service()
-        cks = []
-        for c, img in zip(chunks, images):
-            if not c.strip():
-                continue
-            cleaned_text = await ocr_service.remove_tag(c)
-            positions = await ocr_service.extract_positions(c)
-            cks.append({
-                "text": cleaned_text,
+        cks = [
+            {
+                "text": RAGFlowPdfParser.remove_tag(c),
                 "image": img,
-                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in positions],
-            })
+                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
+            }
+            for c, img in zip(chunks, images) if c.strip()
+        ]
         async with trio.open_nursery() as nursery:
             for d in cks:
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
         self.set_output("chunks", cks)
         self.callback(1, "Done.")
@@ -126,7 +126,7 @@ class Tokenizer(ProcessBase):
             if ck.get("summary"):
                 ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
-            else:
+            elif ck.get("text"):
                 ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
             if i % 100 == 99:
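The Tokenizer now prefers a chunk's summary and only falls back to its text, skipping chunks that carry neither. A toy version of that selection logic with a stand-in tokenizer (rag_tokenizer is RAGFlow's own):

def tokenize_chunks(chunks: list[dict], tokenize) -> None:
    for ck in chunks:
        # Prefer the generated summary; fall back to raw text; skip empty chunks.
        source = ck.get("summary") or ck.get("text")
        if not source:
            continue
        ck["content_ltks"] = tokenize(str(source))

tokenize_chunks([{"summary": "s"}, {"text": "t"}, {}], lambda s: s.split())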
@@ -155,6 +155,8 @@ class Tokenizer(ProcessBase):
         for i, ck in enumerate(chunks):
             ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
             ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
+            if not ck.get("text"):
+                continue
             ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
             ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
             if i % 100 == 99: