v0.21.1-fastapi

This commit is contained in:
2025-11-04 16:06:36 +08:00
parent 3e58c3d0e9
commit d57b5d76ae
218 changed files with 19617 additions and 72339 deletions

View File

@@ -29,8 +29,8 @@ from api.db.services.llm_service import LLMBundle
from api.utils import get_uuid
from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from ocr.service import get_ocr_service
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
@@ -53,6 +53,7 @@ class ParserParam(ProcessParamBase):
],
"word": [
"json",
"markdown",
],
"slides": [
"json",
@@ -138,9 +139,16 @@ class ParserParam(ProcessParamBase):
"oggvorbis",
"ape"
],
"output_format": "json",
"output_format": "text",
},
"video": {
"suffix":[
"mp4",
"avi",
"mkv"
],
"output_format": "text",
},
"video": {},
}
def check(self):
@@ -149,7 +157,7 @@ class ParserParam(ProcessParamBase):
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
@@ -184,8 +192,10 @@ class ParserParam(ProcessParamBase):
audio_config = self.setups.get("audio", "")
if audio_config:
self.check_empty(audio_config.get("llm_id"), "Audio VLM")
audio_language = audio_config.get("lang", "")
self.check_empty(audio_language, "Language")
video_config = self.setups.get("video", "")
if video_config:
self.check_empty(video_config.get("llm_id"), "Video VLM")
email_config = self.setups.get("email", "")
if email_config:
@@ -205,19 +215,38 @@ class Parser(ProcessBase):
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method").lower() == "deepdoc":
# 注意HTTP 调用中无法传递 callbackcallback 将被忽略
ocr_service = get_ocr_service()
bboxes = ocr_service.parse_into_bboxes_sync(blob, callback=self.callback, filename=name)
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
elif conf.get("parse_method").lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
elif conf.get("parse_method").lower() == "mineru":
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
pdf_parser = MinerUParser(mineru_path=mineru_executable)
if not pdf_parser.check_installation():
raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
lines, _ = pdf_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
bboxes = []
for t, poss in lines:
box = {
"image": pdf_parser.crop(poss, 1),
"positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)],
"text": t,
}
bboxes.append(box)
else:
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
bboxes = []
for t, poss in lines:
pn, x0, x1, top, bott = poss.split(" ")
bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
if conf.get("output_format") == "json":
self.set_output("json", bboxes)
@@ -250,13 +279,15 @@ class Parser(ProcessBase):
conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"])
docx_parser = Docx()
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
# json
assert conf.get("output_format") == "json", "have to be json for doc"
if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
self.set_output("json", sections)
elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob)
self.set_output("markdown", markdown_text)
def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
@@ -348,24 +379,34 @@ class Parser(ProcessBase):
conf = self._param.setups["audio"]
self.set_output("output_format", conf["output_format"])
lang = conf["lang"]
_, ext = os.path.splitext(name)
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
tmpf.write(blob)
tmpf.flush()
tmp_path = os.path.abspath(tmpf.name)
seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT)
txt = seq2txt_mdl.transcription(tmp_path)
self.set_output("text", txt)
def _video(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.")
conf = self._param.setups["video"]
self.set_output("output_format", conf["output_format"])
cv_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT)
txt = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name)
self.set_output("text", txt)
def _email(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
email_content = {}
conf = self._param.setups["email"]
self.set_output("output_format", conf["output_format"])
target_fields = conf["fields"]
_, ext = os.path.splitext(name)
@@ -403,8 +444,8 @@ class Parser(ProcessBase):
_add_content(msg, msg.get_content_type())
email_content["text"] = body_text
email_content["text_html"] = body_html
email_content["text"] = "\n".join(body_text)
email_content["text_html"] = "\n".join(body_html)
# get attachment
if "attachments" in target_fields:
attachments = []
@@ -414,7 +455,7 @@ class Parser(ProcessBase):
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
payload = part.get_payload(decode=True).decode(part.get_content_charset())
attachments.append({
"filename": filename,
"payload": payload,
@@ -442,15 +483,16 @@ class Parser(ProcessBase):
}
# get body
if "body" in target_fields:
email_content["text"] = msg.body # usually empty. try text_html instead
email_content["text_html"] = msg.htmlBody
email_content["text"] = msg.body[0] if isinstance(msg.body, list) and msg.body else msg.body
if not email_content["text"] and msg.htmlBody:
email_content["text"] = msg.htmlBody[0] if isinstance(msg.htmlBody, list) and msg.htmlBody else msg.htmlBody
# get attachments
if "attachments" in target_fields:
attachments = []
for t in msg.attachments:
attachments.append({
"filename": t.name,
"payload": t.data # binary
"payload": t.data.decode("utf-8")
})
email_content["attachments"] = attachments
@@ -485,6 +527,7 @@ class Parser(ProcessBase):
"word": self._word,
"image": self._image,
"audio": self._audio,
"video": self._video,
"email": self._email,
}
try:
@@ -514,4 +557,4 @@ class Parser(ProcessBase):
outs = self.output()
async with trio.open_nursery() as nursery:
for d in outs.get("json", []):
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())