v0.21.1-fastapi
@@ -22,7 +22,7 @@ import trio
 
 from api.utils import get_uuid
 from api.utils.base64_image import id2image, image2id
-from ocr.service import get_ocr_service
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
 from rag.nlp import concat_img
@@ -166,24 +166,21 @@ class HierarchicalMerger(ProcessBase):
             img = None
             for i in path:
                 txt += lines[i] + "\n"
-                concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
+                concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))
             cks.append(txt)
             images.append(img)

-        ocr_service = get_ocr_service()
-        processed_cks = []
-        for c, img in zip(cks, images):
-            cleaned_text = await ocr_service.remove_tag(c)
-            positions = await ocr_service.extract_positions(c)
-            processed_cks.append({
-                "text": cleaned_text,
+        cks = [
+            {
+                "text": RAGFlowPdfParser.remove_tag(c),
                 "image": img,
-                "positions": positions,
-            })
-        cks = processed_cks
+                "positions": RAGFlowPdfParser.extract_positions(c),
+            }
+            for c, img in zip(cks, images)
+        ]
         async with trio.open_nursery() as nursery:
             for d in cks:
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
         self.set_output("chunks", cks)

         self.callback(1, "Done.")
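The HierarchicalMerger hunk above (and the Splitter hunk further down) drops the async OCR-service calls in favour of the same pattern: strip the inline position tags out of each merged text, keep the extracted positions, and pair the result with its merged image. A minimal sketch of that pattern, using a hypothetical tag format and local helpers rather than RAGFlowPdfParser's actual implementation:

import re

# Hypothetical inline tag: "@@page\tx0\tx1\ttop\tbottom##" appended to the text.
TAG_RE = re.compile(r"@@([0-9.\t]+)##")

def remove_tag(text: str) -> str:
    # Drop every position tag, leaving only the readable text.
    return TAG_RE.sub("", text).strip()

def extract_positions(text: str) -> list[list[float]]:
    # One [page, x0, x1, top, bottom] entry per tag, in order of appearance.
    return [[float(v) for v in m.group(1).split("\t")] for m in TAG_RE.finditer(text)]

def build_chunks(texts: list[str], images: list) -> list[dict]:
    # Same shape as the list comprehension in the diff: one dict per non-empty text.
    return [
        {"text": remove_tag(t), "image": img, "positions": extract_positions(t)}
        for t, img in zip(texts, images)
        if t.strip()
    ]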
@@ -29,8 +29,8 @@ from api.db.services.llm_service import LLMBundle
 from api.utils import get_uuid
 from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
-from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from ocr.service import get_ocr_service
+from deepdoc.parser.mineru_parser import MinerUParser
+from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
 from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
@@ -53,6 +53,7 @@ class ParserParam(ProcessParamBase):
             ],
             "word": [
                 "json",
+                "markdown",
             ],
             "slides": [
                 "json",
@@ -138,9 +139,16 @@ class ParserParam(ProcessParamBase):
                     "oggvorbis",
                     "ape"
                 ],
-                "output_format": "json",
+                "output_format": "text",
             },
+            "video": {
+                "suffix":[
+                    "mp4",
+                    "avi",
+                    "mkv"
+                ],
+                "output_format": "text",
+            },
-            "video": {},
         }

     def check(self):
@@ -149,7 +157,7 @@ class ParserParam(ProcessParamBase):
         pdf_parse_method = pdf_config.get("parse_method", "")
         self.check_empty(pdf_parse_method, "Parse method abnormal.")

-        if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
+        if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
             self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")

         pdf_output_format = pdf_config.get("output_format", "")
@@ -184,8 +192,10 @@ class ParserParam(ProcessParamBase):
         audio_config = self.setups.get("audio", "")
         if audio_config:
             self.check_empty(audio_config.get("llm_id"), "Audio VLM")
-            audio_language = audio_config.get("lang", "")
-            self.check_empty(audio_language, "Language")

+        video_config = self.setups.get("video", "")
+        if video_config:
+            self.check_empty(video_config.get("llm_id"), "Video VLM")
+
         email_config = self.setups.get("email", "")
         if email_config:
@@ -205,19 +215,38 @@ class Parser(ProcessBase):
         self.set_output("output_format", conf["output_format"])

         if conf.get("parse_method").lower() == "deepdoc":
-            # Note: callback cannot be passed through the HTTP call, so it is ignored here
-            ocr_service = get_ocr_service()
-            bboxes = ocr_service.parse_into_bboxes_sync(blob, callback=self.callback, filename=name)
+            bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
         elif conf.get("parse_method").lower() == "plain_text":
             lines, _ = PlainParser()(blob)
             bboxes = [{"text": t} for t, _ in lines]
+        elif conf.get("parse_method").lower() == "mineru":
+            mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
+            pdf_parser = MinerUParser(mineru_path=mineru_executable)
+            if not pdf_parser.check_installation():
+                raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
+
+            lines, _ = pdf_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+            )
+            bboxes = []
+            for t, poss in lines:
+                box = {
+                    "image": pdf_parser.crop(poss, 1),
+                    "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)],
+                    "text": t,
+                }
+                bboxes.append(box)
         else:
             vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
             lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
             bboxes = []
             for t, poss in lines:
-                pn, x0, x1, top, bott = poss.split(" ")
-                bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+                for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
+                    bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})

         if conf.get("output_format") == "json":
             self.set_output("json", bboxes)
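The new "mineru" branch resolves the external executable and its output directory from environment variables before parsing. A self-contained sketch of that configuration step, assuming the same variable names; shutil.which stands in here for MinerUParser.check_installation, which is project-specific:

import os
import shutil

def resolve_mineru_config() -> dict:
    executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
    if shutil.which(executable) is None:
        raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
    return {
        "executable": executable,
        "output_dir": os.environ.get("MINERU_OUTPUT_DIR", ""),
        # "1"/"0" style environment flags become booleans.
        "delete_output": bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
    }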
@@ -250,13 +279,15 @@ class Parser(ProcessBase):
         conf = self._param.setups["word"]
         self.set_output("output_format", conf["output_format"])
         docx_parser = Docx()
-        sections, tbls = docx_parser(name, binary=blob)
-        sections = [{"text": section[0], "image": section[1]} for section in sections if section]
-        sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
-        # json
-        assert conf.get("output_format") == "json", "have to be json for doc"
+
         if conf.get("output_format") == "json":
+            sections, tbls = docx_parser(name, binary=blob)
+            sections = [{"text": section[0], "image": section[1]} for section in sections if section]
+            sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
             self.set_output("json", sections)
+        elif conf.get("output_format") == "markdown":
+            markdown_text = docx_parser.to_markdown(name, binary=blob)
+            self.set_output("markdown", markdown_text)

     def _slides(self, name, blob):
         from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
@@ -348,24 +379,34 @@ class Parser(ProcessBase):
 
         conf = self._param.setups["audio"]
         self.set_output("output_format", conf["output_format"])

-        lang = conf["lang"]
         _, ext = os.path.splitext(name)
         with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
             tmpf.write(blob)
             tmpf.flush()
             tmp_path = os.path.abspath(tmpf.name)

-            seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
+            seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT)
             txt = seq2txt_mdl.transcription(tmp_path)

             self.set_output("text", txt)

+    def _video(self, name, blob):
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.")
+
+        conf = self._param.setups["video"]
+        self.set_output("output_format", conf["output_format"])
+
+        cv_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT)
+        txt = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=blob, filename=name)
+
+        self.set_output("text", txt)
+
     def _email(self, name, blob):
         self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")

         email_content = {}
         conf = self._param.setups["email"]
         self.set_output("output_format", conf["output_format"])
         target_fields = conf["fields"]

         _, ext = os.path.splitext(name)
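_audio keeps its temporary-file idiom: the uploaded bytes are written to a NamedTemporaryFile carrying the original extension so a path-based transcription API can read them. A sketch of that idiom with a stand-in transcribe callable (LLMBundle is RAGFlow-specific):

import os
import tempfile

def transcribe_blob(name: str, blob: bytes, transcribe) -> str:
    # Keep the original extension so the backend can sniff the container format.
    _, ext = os.path.splitext(name)
    with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
        tmpf.write(blob)
        tmpf.flush()  # ensure bytes are on disk before handing the path over
        return transcribe(os.path.abspath(tmpf.name))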
@@ -403,8 +444,8 @@ class Parser(ProcessBase):
 
                 _add_content(msg, msg.get_content_type())

-            email_content["text"] = body_text
-            email_content["text_html"] = body_html
+            email_content["text"] = "\n".join(body_text)
+            email_content["text_html"] = "\n".join(body_html)
             # get attachment
             if "attachments" in target_fields:
                 attachments = []
@@ -414,7 +455,7 @@ class Parser(ProcessBase):
                     dispositions = content_disposition.strip().split(";")
                     if dispositions[0].lower() == "attachment":
                         filename = part.get_filename()
-                        payload = part.get_payload(decode=True)
+                        payload = part.get_payload(decode=True).decode(part.get_content_charset())
                         attachments.append({
                             "filename": filename,
                             "payload": payload,
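The changed line decodes each attachment payload with the part's declared charset instead of keeping raw bytes. A standalone version of the surrounding loop using only the standard library; the fallback to utf-8 when no charset is declared is an assumption, not something the diff shows:

from email import message_from_bytes

def extract_attachments(raw: bytes) -> list[dict]:
    msg = message_from_bytes(raw)
    attachments = []
    for part in msg.walk():
        content_disposition = part.get("Content-Disposition", "")
        dispositions = content_disposition.strip().split(";")
        if dispositions[0].lower() == "attachment":
            payload = part.get_payload(decode=True)  # bytes after transfer-encoding is undone
            charset = part.get_content_charset() or "utf-8"  # assumed fallback
            attachments.append({
                "filename": part.get_filename(),
                "payload": payload.decode(charset, errors="replace") if payload else "",
            })
    return attachments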
@@ -442,15 +483,16 @@ class Parser(ProcessBase):
             }
             # get body
             if "body" in target_fields:
-                email_content["text"] = msg.body # usually empty. try text_html instead
-                email_content["text_html"] = msg.htmlBody
+                email_content["text"] = msg.body[0] if isinstance(msg.body, list) and msg.body else msg.body
+                if not email_content["text"] and msg.htmlBody:
+                    email_content["text"] = msg.htmlBody[0] if isinstance(msg.htmlBody, list) and msg.htmlBody else msg.htmlBody
             # get attachments
             if "attachments" in target_fields:
                 attachments = []
                 for t in msg.attachments:
                     attachments.append({
                         "filename": t.name,
-                        "payload": t.data # binary
+                        "payload": t.data.decode("utf-8")
                     })
                 email_content["attachments"] = attachments
@@ -485,6 +527,7 @@ class Parser(ProcessBase):
             "word": self._word,
             "image": self._image,
             "audio": self._audio,
+            "video": self._video,
             "email": self._email,
         }
         try:
@@ -514,4 +557,4 @@ class Parser(ProcessBase):
         outs = self.output()
         async with trio.open_nursery() as nursery:
             for d in outs.get("json", []):
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
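Every storage call in this commit now receives tenant_id through functools.partial before being handed to the trio nursery, so the upload helper itself stays tenant-agnostic. A runnable sketch of that fan-out pattern; upload_image and put_object are illustrative stand-ins for image2id and STORAGE_IMPL.put:

from functools import partial

import trio

async def put_object(tenant_id: str, object_id: str, data: bytes) -> None:
    await trio.sleep(0)  # stand-in for a real storage backend call

async def upload_image(doc: dict, put, object_id: str) -> None:
    # Store the image under object_id and keep only a reference in the chunk.
    await put(object_id, doc.pop("image", b""))
    doc["img_id"] = object_id

async def main() -> None:
    chunks = [{"text": "a", "image": b"..."}, {"text": "b", "image": b"..."}]
    async with trio.open_nursery() as nursery:
        for i, d in enumerate(chunks):
            # partial() bakes the tenant into the storage callable, as the diff does.
            nursery.start_soon(upload_image, d, partial(put_object, "tenant-1"), f"obj-{i}")
    print(chunks)

trio.run(main)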
@@ -25,7 +25,7 @@ class SplitterFromUpstream(BaseModel):
     file: dict | None = Field(default=None)
     chunks: list[dict[str, Any]] | None = Field(default=None)

-    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
+    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)

     json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
     markdown_result: str | None = Field(default=None, alias="markdown")
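The schema change only widens the Literal with a "chunks" option; the aliased fields let the upstream payload keep its literal "json" and "markdown" keys. A compact sketch of that pattern against pydantic v2 (the model name and config below are assumptions):

from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field

class UpstreamPayload(BaseModel):
    # Accept either the alias ("json") or the attribute name when validating.
    model_config = ConfigDict(populate_by_name=True)

    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")

payload = UpstreamPayload.model_validate({"output_format": "chunks", "json": [{"text": "hi"}]})
print(payload.json_result)  # [{'text': 'hi'}]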
@@ -19,7 +19,7 @@ import trio
 
 from api.utils import get_uuid
 from api.utils.base64_image import id2image, image2id
-from ocr.service import get_ocr_service
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.splitter.schema import SplitterFromUpstream
 from rag.nlp import naive_merge, naive_merge_with_images
@@ -87,7 +87,7 @@ class Splitter(ProcessBase):
         sections, section_images = [], []
         for o in from_upstream.json_result or []:
             sections.append((o.get("text", ""), o.get("position_tag", "")))
-            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))
+            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))

         chunks, images = naive_merge_with_images(
             sections,
@@ -96,20 +96,16 @@ class Splitter(ProcessBase):
             deli,
             self._param.overlapped_percent,
         )
-        ocr_service = get_ocr_service()
-        cks = []
-        for c, img in zip(chunks, images):
-            if not c.strip():
-                continue
-            cleaned_text = await ocr_service.remove_tag(c)
-            positions = await ocr_service.extract_positions(c)
-            cks.append({
-                "text": cleaned_text,
+        cks = [
+            {
+                "text": RAGFlowPdfParser.remove_tag(c),
                 "image": img,
-                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in positions],
-            })
+                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
+            }
+            for c, img in zip(chunks, images) if c.strip()
+        ]
         async with trio.open_nursery() as nursery:
             for d in cks:
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
         self.set_output("chunks", cks)
         self.callback(1, "Done.")
@@ -126,7 +126,7 @@ class Tokenizer(ProcessBase):
             if ck.get("summary"):
                 ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
-            else:
+            elif ck.get("text"):
                 ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
             if i % 100 == 99:
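The Tokenizer now prefers a chunk's summary and only falls back to its text, skipping chunks that carry neither. A toy version of that selection logic with a stand-in tokenizer (rag_tokenizer is RAGFlow's own):

def tokenize_chunks(chunks: list[dict], tokenize) -> None:
    for ck in chunks:
        # Prefer the generated summary; fall back to raw text; skip empty chunks.
        source = ck.get("summary") or ck.get("text")
        if not source:
            continue
        ck["content_ltks"] = tokenize(str(source))

tokenize_chunks([{"summary": "s"}, {"text": "t"}, {}], lambda s: s.split())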
@@ -155,6 +155,8 @@ class Tokenizer(ProcessBase):
         for i, ck in enumerate(chunks):
             ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
             ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
+            if not ck.get("text"):
+                continue
             ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
             ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
             if i % 100 == 99: