Change flask to fastapi

2025-10-13 13:18:03 +08:00
commit 88db2539b0
476 changed files with 739741 additions and 0 deletions

58
rag/flow/__init__.py Normal file

@@ -0,0 +1,58 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib
import inspect
import pkgutil
from pathlib import Path
from types import ModuleType
from typing import Dict, Type
__all_classes: Dict[str, Type] = {}
_pkg_dir = Path(__file__).resolve().parent
_pkg_name = __name__
def _should_skip_module(mod_name: str) -> bool:
leaf = mod_name.rsplit(".", 1)[-1]
return leaf in {"__init__"} or leaf.startswith("__") or leaf.startswith("_") or leaf.startswith("base")
def _import_submodules() -> None:
for modinfo in pkgutil.walk_packages([str(_pkg_dir)], prefix=_pkg_name + "."): # noqa: F821
mod_name = modinfo.name
if _should_skip_module(mod_name): # noqa: F821
continue
try:
module = importlib.import_module(mod_name)
_extract_classes_from_module(module) # noqa: F821
except ImportError as e:
print(f"Warning: Failed to import module {mod_name}: {e}")
def _extract_classes_from_module(module: ModuleType) -> None:
for name, obj in inspect.getmembers(module):
if inspect.isclass(obj) and obj.__module__ == module.__name__ and not name.startswith("_"):
__all_classes[name] = obj
globals()[name] = obj
_import_submodules()
__all__ = list(__all_classes.keys()) + ["__all_classes"]
del _pkg_dir, _pkg_name, _import_submodules, _extract_classes_from_module
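
A minimal lookup sketch (illustrative only, assuming the submodules added in this commit import cleanly): the registry built above can resolve a flow component class by name.

# Hedged usage sketch for the __all_classes registry; "Parser" is just an example key.
from rag import flow

def resolve_component(name: str) -> type:
    cls = flow.__all_classes.get(name)
    if cls is None:
        raise KeyError(f"Unknown flow component: {name}")
    return cls

ParserCls = resolve_component("Parser")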

61
rag/flow/base.py Normal file

@@ -0,0 +1,61 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import time
from functools import partial
from typing import Any
import trio
from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.api_utils import timeout
class ProcessParamBase(ComponentParamBase):
def __init__(self):
super().__init__()
self.timeout = 100000000
self.persist_logs = True
class ProcessBase(ComponentBase):
def __init__(self, pipeline, id, param: ProcessParamBase):
super().__init__(pipeline, id, param)
if hasattr(self._canvas, "callback"):
self.callback = partial(self._canvas.callback, id)
else:
self.callback = partial(lambda *args, **kwargs: None, id)
async def invoke(self, **kwargs) -> dict[str, Any]:
self.set_output("_created_time", time.perf_counter())
for k, v in kwargs.items():
self.set_output(k, v)
try:
with trio.fail_after(self._param.timeout):
await self._invoke(**kwargs)
self.callback(1, "Done")
except Exception as e:
if self.get_exception_default_value():
self.set_exception_default_value()
else:
self.set_output("_ERROR", str(e))
logging.exception(e)
self.callback(-1, str(e))
self.set_output("_elapsed_time", time.perf_counter() - self.output("_created_time"))
return self.output()
@timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60)))
async def _invoke(self, **kwargs):
raise NotImplementedError()
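
For reference, a minimal sketch of a custom step built on this contract, assuming ComponentBase provides set_output and parameter access as used throughout this commit; the component and its parameter are hypothetical.

from rag.flow.base import ProcessBase, ProcessParamBase

class UpperCaseParam(ProcessParamBase):
    def __init__(self):
        super().__init__()
        self.field = "text"  # hypothetical parameter: which chunk field to transform

    def check(self):
        self.check_empty(self.field, "Field name")

class UpperCase(ProcessBase):
    component_name = "UpperCase"

    async def _invoke(self, **kwargs):
        # Upstream outputs arrive as keyword arguments, as with Splitter/Tokenizer below.
        chunks = kwargs.get("chunks") or []
        for ck in chunks:
            ck[self._param.field] = str(ck.get(self._param.field, "")).upper()
        self.set_output("output_format", "chunks")
        self.set_output("chunks", chunks)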


@@ -0,0 +1,15 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -0,0 +1,63 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from copy import deepcopy
from agent.component.llm import LLMParam, LLM
from rag.flow.base import ProcessBase, ProcessParamBase
class ExtractorParam(ProcessParamBase, LLMParam):
def __init__(self):
super().__init__()
self.field_name = ""
def check(self):
super().check()
self.check_empty(self.field_name, "Result Destination")
class Extractor(ProcessBase, LLM):
component_name = "Extractor"
async def _invoke(self, **kwargs):
self.set_output("output_format", "chunks")
self.callback(random.randint(1, 5) / 100.0, "Start to generate.")
inputs = self.get_input_elements()
chunks = []
chunks_key = ""
args = {}
for k, v in inputs.items():
args[k] = v["value"]
if isinstance(args[k], list):
chunks = deepcopy(args[k])
chunks_key = k
if chunks:
prog = 0
for i, ck in enumerate(chunks):
args[chunks_key] = ck["text"]
msg, sys_prompt = self._sys_prompt_and_msg([], args)
msg.insert(0, {"role": "system", "content": sys_prompt})
ck[self._param.field_name] = self._generate(msg)
prog += 1./len(chunks)
if i % (len(chunks)//100+1) == 1:
self.callback(prog, f"{i+1} / {len(chunks)}")
self.set_output("chunks", chunks)
else:
msg, sys_prompt = self._sys_prompt_and_msg([], args)
msg.insert(0, {"role": "system", "content": sys_prompt})
self.set_output("chunks", [{self._param.field_name: self._generate(msg)}])


@@ -0,0 +1,38 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
class ExtractorFromUpstream(BaseModel):
created_time: float | None = Field(default=None, alias="_created_time")
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str
file: dict | None = Field(default=None)
chunks: list[dict[str, Any]] | None = Field(default=None)
output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")
# def to_dict(self, *, exclude_none: bool = True) -> dict:
# return self.model_dump(by_alias=True, exclude_none=exclude_none)

50
rag/flow/file.py Normal file

@@ -0,0 +1,50 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from api.db.services.document_service import DocumentService
from rag.flow.base import ProcessBase, ProcessParamBase
class FileParam(ProcessParamBase):
def __init__(self):
super().__init__()
def check(self):
pass
def get_input_form(self) -> dict[str, dict]:
return {}
class File(ProcessBase):
component_name = "File"
async def _invoke(self, **kwargs):
if self._canvas._doc_id:
e, doc = DocumentService.get_by_id(self._canvas._doc_id)
if not e:
self.set_output("_ERROR", f"Document({self._canvas._doc_id}) not found!")
return
#b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
#self.set_output("blob", STORAGE_IMPL.get(b, n))
self.set_output("name", doc.name)
else:
file = kwargs.get("file")
self.set_output("name", file["name"])
self.set_output("file", file)
#self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
self.callback(1, "File fetched.")


@@ -0,0 +1,15 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -0,0 +1,186 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import re
from copy import deepcopy
from functools import partial
import trio
from api.utils import get_uuid
from api.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
from rag.nlp import concat_img
from rag.utils.storage_factory import STORAGE_IMPL
class HierarchicalMergerParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.levels = []
self.hierarchy = None
def check(self):
self.check_empty(self.levels, "Hierarchical setups.")
self.check_empty(self.hierarchy, "Hierarchy number.")
def get_input_form(self) -> dict[str, dict]:
return {}
class HierarchicalMerger(ProcessBase):
component_name = "HierarchicalMerger"
async def _invoke(self, **kwargs):
try:
from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
except Exception as e:
self.set_output("_ERROR", f"Input error: {str(e)}")
return
self.set_output("output_format", "chunks")
self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
if from_upstream.output_format in ["markdown", "text", "html"]:
if from_upstream.output_format == "markdown":
payload = from_upstream.markdown_result
elif from_upstream.output_format == "text":
payload = from_upstream.text_result
else: # == "html"
payload = from_upstream.html_result
if not payload:
payload = ""
lines = [ln for ln in payload.split("\n") if ln]
else:
arr = from_upstream.chunks if from_upstream.output_format == "chunks" else from_upstream.json_result
lines = [o.get("text", "") for o in arr]
sections, section_images = [], []
for o in arr or []:
sections.append((o.get("text", ""), o.get("position_tag", "")))
section_images.append(o.get("img_id"))
matches = []
for txt in lines:
good = False
for lvl, regs in enumerate(self._param.levels):
for reg in regs:
if re.search(reg, txt):
matches.append(lvl)
good = True
break
if good:
break
if not good:
matches.append(len(self._param.levels))
assert len(matches) == len(lines), f"{len(matches)} vs. {len(lines)}"
root = {
"level": -1,
"index": -1,
"texts": [],
"children": []
}
for i, m in enumerate(matches):
if m == 0:
root["children"].append({
"level": m,
"index": i,
"texts": [],
"children": []
})
elif m == len(self._param.levels):
def dfs(b):
if not b["children"]:
b["texts"].append(i)
else:
dfs(b["children"][-1])
dfs(root)
else:
def dfs(b):
nonlocal m, i
if not b["children"] or m == b["level"] + 1:
b["children"].append({
"level": m,
"index": i,
"texts": [],
"children": []
})
return
dfs(b["children"][-1])
dfs(root)
all_pathes = []
def dfs(n, path, depth):
nonlocal all_pathes
if not n["children"] and path:
all_pathes.append(path)
for nn in n["children"]:
if depth < self._param.hierarchy:
_path = deepcopy(path)
else:
_path = path
_path.extend([nn["index"], *nn["texts"]])
dfs(nn, _path, depth+1)
if depth == self._param.hierarchy:
all_pathes.append(_path)
for i in range(len(lines)):
print(i, lines[i])
dfs(root, [], 0)
if root["texts"]:
all_pathes.insert(0, root["texts"])
if from_upstream.output_format in ["markdown", "text", "html"]:
cks = []
for path in all_pathes:
txt = ""
for i in path:
txt += lines[i] + "\n"
cks.append(txt)
self.set_output("chunks", [{"text": c} for c in cks if c])
else:
cks = []
images = []
for path in all_pathes:
txt = ""
img = None
for i in path:
txt += lines[i] + "\n"
img = concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
cks.append(txt)
images.append(img)
cks = [
{
"text": RAGFlowPdfParser.remove_tag(c),
"image": img,
"positions": RAGFlowPdfParser.extract_positions(c),
}
for c, img in zip(cks, images)
]
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
self.set_output("chunks", cks)
self.callback(1, "Done.")
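
The levels parameter is a list of regex lists, one per heading level, and hierarchy caps the merge depth. A configuration sketch matching the markdown-heading DSL example later in this commit:

levels = [[r"^#[^#]"], [r"^##[^#]"], [r"^###[^#]"], [r"^####[^#]"]]   # "#", "##", "###", "####" headings
hierarchy = 2   # merge content under the first two heading levels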


@@ -0,0 +1,37 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
class HierarchicalMergerFromUpstream(BaseModel):
created_time: float | None = Field(default=None, alias="_created_time")
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str
file: dict | None = Field(default=None)
chunks: list[dict[str, Any]] | None = Field(default=None)
output_format: Literal["json", "chunks"] | None = Field(default=None)
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")
# def to_dict(self, *, exclude_none: bool = True) -> dict:
# return self.model_dump(by_alias=True, exclude_none=exclude_none)


@@ -0,0 +1,14 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

514
rag/flow/parser/parser.py Normal file

@@ -0,0 +1,514 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import json
import os
import random
from functools import partial
import trio
import numpy as np
from PIL import Image
from api.db import LLMType
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
from api.utils import get_uuid
from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
from rag.utils.storage_factory import STORAGE_IMPL
class ParserParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.allowed_output_format = {
"pdf": [
"json",
"markdown",
],
"spreadsheet": [
"json",
"markdown",
"html",
],
"word": [
"json",
],
"slides": [
"json",
],
"image": [
"text"
],
"email": ["text", "json"],
"text&markdown": [
"text",
"json"
],
"audio": [
"json"
],
"video": [],
}
self.setups = {
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
"lang": "Chinese",
"suffix": [
"pdf",
],
"output_format": "json",
},
"spreadsheet": {
"output_format": "html",
"suffix": [
"xls",
"xlsx",
"csv",
],
},
"word": {
"suffix": [
"doc",
"docx",
],
"output_format": "json",
},
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json",
},
"slides": {
"suffix": [
"pptx",
],
"output_format": "json",
},
"image": {
"parse_method": "ocr",
"llm_id": "",
"lang": "Chinese",
"system_prompt": "",
"suffix": ["jpg", "jpeg", "png", "gif"],
"output_format": "text",
},
"email": {
"suffix": [
"eml", "msg"
],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json",
},
"audio": {
"suffix":[
"da",
"wave",
"wav",
"mp3",
"aac",
"flac",
"ogg",
"aiff",
"au",
"midi",
"wma",
"realaudio",
"vqf",
"oggvorbis",
"ape"
],
"output_format": "json",
},
"video": {},
}
def check(self):
pdf_config = self.setups.get("pdf", {})
if pdf_config:
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
spreadsheet_config = self.setups.get("spreadsheet", "")
if spreadsheet_config:
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
doc_config = self.setups.get("word", "")
if doc_config:
doc_output_format = doc_config.get("output_format", "")
self.check_valid_value(doc_output_format, "Word processor document output format abnormal.", self.allowed_output_format["word"])
slides_config = self.setups.get("slides", "")
if slides_config:
slides_output_format = slides_config.get("output_format", "")
self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
image_config = self.setups.get("image", "")
if image_config:
image_parse_method = image_config.get("parse_method", "")
if image_parse_method not in ["ocr"]:
self.check_empty(image_config.get("lang", ""), "Image VLM language")
text_config = self.setups.get("text&markdown", "")
if text_config:
text_output_format = text_config.get("output_format", "")
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
audio_config = self.setups.get("audio", "")
if audio_config:
self.check_empty(audio_config.get("llm_id"), "Audio VLM")
audio_language = audio_config.get("lang", "")
self.check_empty(audio_language, "Language")
email_config = self.setups.get("email", "")
if email_config:
email_output_format = email_config.get("output_format", "")
self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"])
def get_input_form(self) -> dict[str, dict]:
return {}
class Parser(ProcessBase):
component_name = "Parser"
def _pdf(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method").lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
elif conf.get("parse_method").lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
else:
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
bboxes = []
for t, poss in lines:
pn, x0, x1, top, bott = poss.split(" ")
bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
if conf.get("output_format") == "json":
self.set_output("json", bboxes)
if conf.get("output_format") == "markdown":
mkdn = ""
for b in bboxes:
if b.get("layout_type", "") == "title":
mkdn += "\n## "
if b.get("layout_type", "") == "figure":
mkdn += "\n![Image]({})".format(VLM.image2base64(b["image"]))
continue
mkdn += b.get("text", "") + "\n"
self.set_output("markdown", mkdn)
def _spreadsheet(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"])
spreadsheet_parser = ExcelParser()
if conf.get("output_format") == "html":
htmls = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", htmls[0])
elif conf.get("output_format") == "json":
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
elif conf.get("output_format") == "markdown":
self.set_output("markdown", spreadsheet_parser.markdown(blob))
def _word(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"])
docx_parser = Docx()
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
# json
assert conf.get("output_format") == "json", "output_format must be json for Word documents"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
conf = self._param.setups["slides"]
self.set_output("output_format", conf["output_format"])
slides_parser = ppt_parser()
txts = slides_parser(blob, 0, 100000, None)
sections = [{"text": section} for section in txts if section.strip()]
# json
assert conf.get("output_format") == "json", "output_format must be json for slides"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _markdown(self, name, blob):
from functools import reduce
from rag.app.naive import Markdown as naive_markdown_parser
from rag.nlp import concat_img
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
conf = self._param.setups["text&markdown"]
self.set_output("output_format", conf["output_format"])
markdown_parser = naive_markdown_parser()
sections, tables = markdown_parser(name, blob, separate_tables=False)
if conf.get("output_format") == "json":
json_results = []
for section_text, _ in sections:
json_result = {
"text": section_text,
}
images = markdown_parser.get_pictures(section_text) if section_text else None
if images:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
json_result["image"] = combined_image
json_results.append(json_result)
self.set_output("json", json_results)
else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, name, blob):
from deepdoc.vision import OCR
self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
conf = self._param.setups["image"]
self.set_output("output_format", conf["output_format"])
img = Image.open(io.BytesIO(blob)).convert("RGB")
if conf["parse_method"] == "ocr":
# use ocr, recognize chars only
ocr = OCR()
bxs = ocr(np.array(img)) # return boxes and recognize result
txt = "\n".join([t[0] for _, t in bxs if t[0]])
else:
lang = conf["lang"]
# use VLM to describe the picture
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["parse_method"], lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format="JPEG")
img_binary.seek(0)
system_prompt = conf.get("system_prompt")
if system_prompt:
txt = cv_model.describe_with_prompt(img_binary.read(), system_prompt)
else:
txt = cv_model.describe(img_binary.read())
self.set_output("text", txt)
def _audio(self, name, blob):
import os
import tempfile
self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
conf = self._param.setups["audio"]
self.set_output("output_format", conf["output_format"])
lang = conf["lang"]
_, ext = os.path.splitext(name)
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
tmpf.write(blob)
tmpf.flush()
tmp_path = os.path.abspath(tmpf.name)
seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
txt = seq2txt_mdl.transcription(tmp_path)
self.set_output("text", txt)
def _email(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
email_content = {}
conf = self._param.setups["email"]
target_fields = conf["fields"]
_, ext = os.path.splitext(name)
if ext == ".eml":
# handle eml file
from email import policy
from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {}
# handle header info
for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject
if header.lower() in target_fields:
email_content[header.lower()] = value
# get metadata
elif header.lower() not in ["from", "to", "cc", "bcc", "date", "subject"]:
email_content["metadata"][header.lower()] = value
# get body
if "body" in target_fields:
body_text, body_html = [], []
def _add_content(m, content_type):
if content_type == "text/plain":
body_text.append(
m.get_payload(decode=True).decode(m.get_content_charset())
)
elif content_type == "text/html":
body_html.append(
m.get_payload(decode=True).decode(m.get_content_charset())
)
elif "multipart" in content_type:
if m.is_multipart():
for part in m.iter_parts():
_add_content(part, part.get_content_type())
_add_content(msg, msg.get_content_type())
email_content["text"] = body_text
email_content["text_html"] = body_html
# get attachment
if "attachments" in target_fields:
attachments = []
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
attachments.append({
"filename": filename,
"payload": payload,
})
email_content["attachments"] = attachments
else:
# handle msg file
import extract_msg
print("handle a msg file.")
msg = extract_msg.Message(blob)
# handle header info
basic_content = {
"from": msg.sender,
"to": msg.to,
"cc": msg.cc,
"bcc": msg.bcc,
"date": msg.date,
"subject": msg.subject,
}
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata
email_content['metadata'] = {
'message_id': msg.messageId,
'in_reply_to': msg.inReplyTo,
}
# get body
if "body" in target_fields:
email_content["text"] = msg.body # usually empty. try text_html instead
email_content["text_html"] = msg.htmlBody
# get attachments
if "attachments" in target_fields:
attachments = []
for t in msg.attachments:
attachments.append({
"filename": t.name,
"payload": t.data # binary
})
email_content["attachments"] = attachments
if conf["output_format"] == "json":
self.set_output("json", [email_content])
else:
content_txt = ''
for k, v in email_content.items():
if isinstance(v, str):
# basic info
content_txt += f'{k}:{v}' + "\n"
elif isinstance(v, dict):
# metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n"
elif isinstance(v, list):
# attachments or others
for fb in v:
if isinstance(fb, dict):
# attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
else:
# str, usually plain text
content_txt += fb
self.set_output("text", content_txt)
async def _invoke(self, **kwargs):
function_map = {
"pdf": self._pdf,
"text&markdown": self._markdown,
"spreadsheet": self._spreadsheet,
"slides": self._slides,
"word": self._word,
"image": self._image,
"audio": self._audio,
"email": self._email,
}
try:
from_upstream = ParserFromUpstream.model_validate(kwargs)
except Exception as e:
self.set_output("_ERROR", f"Input error: {str(e)}")
return
name = from_upstream.name
if self._canvas._doc_id:
b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
blob = STORAGE_IMPL.get(b, n)
else:
blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
done = False
for p_type, conf in self._param.setups.items():
if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
continue
await trio.to_thread.run_sync(function_map[p_type], name, blob)
done = True
break
if not done:
raise Exception("No suitable for file extension: `.%s`" % from_upstream.name.split(".")[-1].lower())
outs = self.output()
async with trio.open_nursery() as nursery:
for d in outs.get("json", []):
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())

24
rag/flow/parser/schema.py Normal file

@@ -0,0 +1,24 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pydantic import BaseModel, ConfigDict, Field
class ParserFromUpstream(BaseModel):
created_time: float | None = Field(default=None, alias="_created_time")
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str
file: dict | None = Field(default=None)
model_config = ConfigDict(populate_by_name=True, extra="forbid")
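
Parser validates its keyword arguments against this model before doing any work (see _invoke above). A small validation sketch with placeholder values:

from rag.flow.parser.schema import ParserFromUpstream

payload = {"name": "report.pdf", "file": {"id": "file-uuid", "created_by": "tenant-uuid"}}
from_upstream = ParserFromUpstream.model_validate(payload)
print(from_upstream.name)  # "report.pdf"; unknown keys would be rejected (extra="forbid")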

174
rag/flow/pipeline.py Normal file

@@ -0,0 +1,174 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import json
import logging
import random
from timeit import default_timer as timer
import trio
from agent.canvas import Graph
from api.db.services.document_service import DocumentService
from api.db.services.task_service import has_canceled, TaskService, CANVAS_DEBUG_DOC_ID
from rag.utils.redis_conn import REDIS_CONN
class Pipeline(Graph):
def __init__(self, dsl: str|dict, tenant_id=None, doc_id=None, task_id=None, flow_id=None):
if isinstance(dsl, dict):
dsl = json.dumps(dsl, ensure_ascii=False)
super().__init__(dsl, tenant_id, task_id)
if doc_id == CANVAS_DEBUG_DOC_ID:
doc_id = None
self._doc_id = doc_id
self._flow_id = flow_id
self._kb_id = None
if self._doc_id:
self._kb_id = DocumentService.get_knowledgebase_id(doc_id)
if not self._kb_id:
self._doc_id = None
def callback(self, component_name: str, progress: float | int | None = None, message: str = "") -> None:
from rag.svr.task_executor import TaskCanceledException
log_key = f"{self._flow_id}-{self.task_id}-logs"
timestamp = timer()
if has_canceled(self.task_id):
progress = -1
message += "[CANCEL]"
try:
bin = REDIS_CONN.get(log_key)
obj = json.loads(bin.encode("utf-8"))
if obj:
if obj[-1]["component_id"] == component_name:
obj[-1]["trace"].append(
{
"progress": progress,
"message": message,
"datetime": datetime.datetime.now().strftime("%H:%M:%S"),
"timestamp": timestamp,
"elapsed_time": timestamp - obj[-1]["trace"][-1]["timestamp"],
}
)
else:
obj.append(
{
"component_id": component_name,
"trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
}
)
else:
obj = [
{
"component_id": component_name,
"trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
}
]
if component_name != "END" and self._doc_id and self.task_id:
percentage = 1.0 / len(self.components.items())
finished = 0.0
for o in obj:
for t in o["trace"]:
if t["progress"] < 0:
finished = -1
break
if finished < 0:
break
finished += o["trace"][-1]["progress"] * percentage
msg = ""
if len(obj[-1]["trace"]) == 1:
msg += f"\n-------------------------------------\n[{self.get_component_name(obj[-1]['component_id'])}]:\n"
t = obj[-1]["trace"][-1]
msg += "%s: %s\n" % (t["datetime"], t["message"])
TaskService.update_progress(self.task_id, {"progress": finished, "progress_msg": msg})
elif component_name == "END" and not self._doc_id:
obj[-1]["trace"][-1]["dsl"] = json.loads(str(self))
REDIS_CONN.set_obj(log_key, obj, 60 * 30)
except Exception as e:
logging.exception(e)
if has_canceled(self.task_id):
raise TaskCanceledException(message)
def fetch_logs(self):
log_key = f"{self._flow_id}-{self.task_id}-logs"
try:
bin = REDIS_CONN.get(log_key)
if bin:
return json.loads(bin.encode("utf-8"))
except Exception as e:
logging.exception(e)
return []
async def run(self, **kwargs):
log_key = f"{self._flow_id}-{self.task_id}-logs"
try:
REDIS_CONN.set_obj(log_key, [], 60 * 10)
except Exception as e:
logging.exception(e)
self.error = ""
if not self.path:
self.path.append("File")
cpn_obj = self.get_component_obj(self.path[0])
await cpn_obj.invoke(**kwargs)
if cpn_obj.error():
self.error = "[ERROR]" + cpn_obj.error()
self.callback(cpn_obj.component_name, -1, self.error)
if self._doc_id:
TaskService.update_progress(self.task_id, {
"progress": random.randint(0, 5) / 100.0,
"progress_msg": "Start the pipeline...",
"begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
idx = len(self.path) - 1
cpn_obj = self.get_component_obj(self.path[idx])
idx += 1
self.path.extend(cpn_obj.get_downstream())
while idx < len(self.path) and not self.error:
last_cpn = self.get_component_obj(self.path[idx - 1])
cpn_obj = self.get_component_obj(self.path[idx])
async def invoke():
nonlocal last_cpn, cpn_obj
await cpn_obj.invoke(**last_cpn.output())
#if inspect.iscoroutinefunction(cpn_obj.invoke):
# await cpn_obj.invoke(**last_cpn.output())
#else:
# cpn_obj.invoke(**last_cpn.output())
async with trio.open_nursery() as nursery:
nursery.start_soon(invoke)
if cpn_obj.error():
self.error = "[ERROR]" + cpn_obj.error()
self.callback(cpn_obj._id, -1, self.error)
break
idx += 1
self.path.extend(cpn_obj.get_downstream())
self.callback("END", 1 if not self.error else -1, json.dumps(self.get_component_obj(self.path[-1]).output(), ensure_ascii=False))
if not self.error:
return self.get_component_obj(self.path[-1]).output()
TaskService.update_progress(self.task_id, {
"progress": -1,
"progress_msg": f"[ERROR]: {self.error}"})
return {}
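
Progress callbacks are appended to a Redis-backed list keyed by f"{flow_id}-{task_id}-logs". The shape of one entry, with field names taken from the code above and illustrative values:

example_log_entry = {
    "component_id": "Parser:0",
    "trace": [
        {
            "progress": 0.03,        # -1 marks failure or cancellation
            "message": "Start to work on a PDF.",
            "datetime": "13:18:03",
            "timestamp": 1234.5678,  # timeit.default_timer() value
            "elapsed_time": 0,       # delta to the previous trace entry
        }
    ],
}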


@@ -0,0 +1,15 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -0,0 +1,38 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
class SplitterFromUpstream(BaseModel):
created_time: float | None = Field(default=None, alias="_created_time")
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str
file: dict | None = Field(default=None)
chunks: list[dict[str, Any]] | None = Field(default=None)
output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")
# def to_dict(self, *, exclude_none: bool = True) -> dict:
# return self.model_dump(by_alias=True, exclude_none=exclude_none)


@@ -0,0 +1,111 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from functools import partial
import trio
from api.utils import get_uuid
from api.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.splitter.schema import SplitterFromUpstream
from rag.nlp import naive_merge, naive_merge_with_images
from rag.utils.storage_factory import STORAGE_IMPL
class SplitterParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.chunk_token_size = 512
self.delimiters = ["\n"]
self.overlapped_percent = 0
def check(self):
self.check_empty(self.delimiters, "Delimiters.")
self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
def get_input_form(self) -> dict[str, dict]:
return {}
class Splitter(ProcessBase):
component_name = "Splitter"
async def _invoke(self, **kwargs):
try:
from_upstream = SplitterFromUpstream.model_validate(kwargs)
except Exception as e:
self.set_output("_ERROR", f"Input error: {str(e)}")
return
deli = ""
for d in self._param.delimiters:
if len(d) > 1:
deli += f"`{d}`"
else:
deli += d
self.set_output("output_format", "chunks")
self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
if from_upstream.output_format in ["markdown", "text", "html"]:
if from_upstream.output_format == "markdown":
payload = from_upstream.markdown_result
elif from_upstream.output_format == "text":
payload = from_upstream.text_result
else: # == "html"
payload = from_upstream.html_result
if not payload:
payload = ""
cks = naive_merge(
payload,
self._param.chunk_token_size,
deli,
self._param.overlapped_percent,
)
self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
self.callback(1, "Done.")
return
# json
sections, section_images = [], []
for o in from_upstream.json_result or []:
sections.append((o.get("text", ""), o.get("position_tag", "")))
section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))
chunks, images = naive_merge_with_images(
sections,
section_images,
self._param.chunk_token_size,
deli,
self._param.overlapped_percent,
)
cks = [
{
"text": RAGFlowPdfParser.remove_tag(c),
"image": img,
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
}
for c, img in zip(chunks, images) if c.strip()
]
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
self.set_output("chunks", cks)
self.callback(1, "Done.")
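
Before merging, multi-character delimiters are wrapped in backticks while single characters pass through unchanged; the loop above is equivalent to:

delimiters = ["\n", "。", "!?"]  # example values; ["\n"] is the default
deli = "".join(f"`{d}`" if len(d) > 1 else d for d in delimiters)
# deli == "\n。`!?`"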

61
rag/flow/tests/client.py Normal file

@@ -0,0 +1,61 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
import trio
from api import settings
from rag.flow.pipeline import Pipeline
def print_logs(pipeline: Pipeline):
last_logs = "[]"
while True:
time.sleep(5)
logs = pipeline.fetch_logs()
logs_str = json.dumps(logs, ensure_ascii=False)
if logs_str != last_logs:
print(logs_str)
last_logs = logs_str
if __name__ == "__main__":
parser = argparse.ArgumentParser()
dsl_default_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"dsl_examples",
"general_pdf_all.json",
)
parser.add_argument("-s", "--dsl", default=dsl_default_path, help="input dsl", action="store", required=False)
parser.add_argument("-d", "--doc_id", default=False, help="Document ID", action="store", required=True)
parser.add_argument("-t", "--tenant_id", default=False, help="Tenant ID", action="store", required=True)
args = parser.parse_args()
settings.init_settings()
pipeline = Pipeline(open(args.dsl, "r").read(), tenant_id=args.tenant_id, doc_id=args.doc_id, task_id="xxxx", flow_id="xxx")
pipeline.reset()
exe = ThreadPoolExecutor(max_workers=5)
thr = exe.submit(print_logs, pipeline)
# queue_dataflow(dsl=open(args.dsl, "r").read(), tenant_id=args.tenant_id, doc_id=args.doc_id, task_id="xxxx", flow_id="xxx", priority=0)
trio.run(pipeline.run)
thr.result()


@@ -0,0 +1,139 @@
{
"components": {
"File": {
"obj":{
"component_name": "File",
"params": {
}
},
"downstream": ["Parser:0"],
"upstream": []
},
"Parser:0": {
"obj": {
"component_name": "Parser",
"params": {
"setups": {
"pdf": {
"parse_method": "deepdoc",
"vlm_name": "",
"lang": "Chinese",
"suffix": [
"pdf"
],
"output_format": "json"
},
"spreadsheet": {
"suffix": [
"xls",
"xlsx",
"csv"
],
"output_format": "html"
},
"word": {
"suffix": [
"doc",
"docx"
],
"output_format": "json"
},
"slides": {
"parse_method": "presentation",
"suffix": [
"pptx"
],
"output_format": "json"
},
"markdown": {
"suffix": [
"md",
"markdown"
],
"output_format": "json"
},
"text": {
"suffix": ["txt"],
"output_format": "json"
},
"image": {
"parse_method": "vlm",
"llm_id":"glm-4.5v",
"lang": "Chinese",
"suffix": [
"jpg",
"jpeg",
"png",
"gif"
],
"output_format": "text"
},
"audio": {
"suffix": [
"da",
"wave",
"wav",
"mp3",
"aac",
"flac",
"ogg",
"aiff",
"au",
"midi",
"wma",
"realaudio",
"vqf",
"oggvorbis",
"ape"
],
"lang": "Chinese",
"llm_id": "SenseVoiceSmall",
"output_format": "json"
},
"email": {
"suffix": [
"msg"
],
"fields": [
"from",
"to",
"cc",
"bcc",
"date",
"subject",
"body",
"attachments"
],
"output_format": "json"
}
}
}
},
"downstream": ["Splitter:0"],
"upstream": ["Begin"]
},
"Splitter:0": {
"obj": {
"component_name": "Splitter",
"params": {
"chunk_token_size": 512,
"delimiters": ["\n"],
"overlapped_percent": 0
}
},
"downstream": ["Tokenizer:0"],
"upstream": ["Parser:0"]
},
"Tokenizer:0": {
"obj": {
"component_name": "Tokenizer",
"params": {
}
},
"downstream": [],
"upstream": ["Chunker:0"]
}
},
"path": []
}


@@ -0,0 +1,84 @@
{
"components": {
"File": {
"obj":{
"component_name": "File",
"params": {
}
},
"downstream": ["Parser:0"],
"upstream": []
},
"Parser:0": {
"obj": {
"component_name": "Parser",
"params": {
"setups": {
"pdf": {
"parse_method": "deepdoc",
"vlm_name": "",
"lang": "Chinese",
"suffix": [
"pdf"
],
"output_format": "json"
},
"spreadsheet": {
"suffix": [
"xls",
"xlsx",
"csv"
],
"output_format": "html"
},
"word": {
"suffix": [
"doc",
"docx"
],
"output_format": "json"
},
"markdown": {
"suffix": [
"md",
"markdown"
],
"output_format": "text"
},
"text": {
"suffix": ["txt"],
"output_format": "json"
}
}
}
},
"downstream": ["Splitter:0"],
"upstream": ["File"]
},
"Splitter:0": {
"obj": {
"component_name": "Splitter",
"params": {
"chunk_token_size": 512,
"delimiters": ["\r\n"],
"overlapped_percent": 0
}
},
"downstream": ["HierarchicalMerger:0"],
"upstream": ["Parser:0"]
},
"HierarchicalMerger:0": {
"obj": {
"component_name": "HierarchicalMerger",
"params": {
"levels": [["^#[^#]"], ["^##[^#]"], ["^###[^#]"], ["^####[^#]"]],
"hierarchy": 2
}
},
"downstream": [],
"upstream": ["Splitter:0"]
}
},
"path": []
}


@@ -0,0 +1,14 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -0,0 +1,53 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, model_validator
class TokenizerFromUpstream(BaseModel):
created_time: float | None = Field(default=None, alias="_created_time")
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str = ""
file: dict | None = Field(default=None)
output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
chunks: list[dict[str, Any]] | None = Field(default=None)
json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
markdown_result: str | None = Field(default=None, alias="markdown")
text_result: str | None = Field(default=None, alias="text")
html_result: str | None = Field(default=None, alias="html")
model_config = ConfigDict(populate_by_name=True, extra="forbid")
@model_validator(mode="after")
def _check_payloads(self) -> "TokenizerFromUpstream":
if self.chunks:
return self
if self.output_format in {"markdown", "text", "html"}:
if self.output_format == "markdown" and not self.markdown_result:
raise ValueError("output_format=markdown requires a markdown payload (field: 'markdown' or 'markdown_result').")
if self.output_format == "text" and not self.text_result:
raise ValueError("output_format=text requires a text payload (field: 'text' or 'text_result').")
if self.output_format == "html" and not self.html_result:
raise ValueError("output_format=text requires a html payload (field: 'html' or 'html_result').")
else:
if not self.json_result and not self.chunks:
raise ValueError("When no chunks are provided and output_format is not markdown/text, a JSON list payload is required (field: 'json' or 'json_result').")
return self


@@ -0,0 +1,176 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
import re
import numpy as np
import trio
from api.db import LLMType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import TenantService
from api.utils.api_utils import timeout
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.tokenizer.schema import TokenizerFromUpstream
from rag.nlp import rag_tokenizer
from rag.settings import EMBEDDING_BATCH_SIZE
from rag.svr.task_executor import embed_limiter
from rag.utils import truncate
class TokenizerParam(ProcessParamBase):
def __init__(self):
super().__init__()
self.search_method = ["full_text", "embedding"]
self.filename_embd_weight = 0.1
self.fields = ["text"]
def check(self):
for v in self.search_method:
self.check_valid_value(v.lower(), "Chunk method abnormal.", ["full_text", "embedding"])
def get_input_form(self) -> dict[str, dict]:
return {}
class Tokenizer(ProcessBase):
component_name = "Tokenizer"
async def _embedding(self, name, chunks):
parts = sum(["full_text" in self._param.search_method, "embedding" in self._param.search_method])
token_count = 0
if self._canvas._kb_id:
e, kb = KnowledgebaseService.get_by_id(self._canvas._kb_id)
embedding_id = kb.embd_id
else:
e, ten = TenantService.get_by_id(self._canvas._tenant_id)
embedding_id = ten.embd_id
embedding_model = LLMBundle(self._canvas._tenant_id, LLMType.EMBEDDING, llm_name=embedding_id)
texts = []
for c in chunks:
txt = ""
for f in self._param.fields:
f = c.get(f)
if isinstance(f, str):
txt += f
elif isinstance(f, list):
txt += "\n".join(f)
texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
vts, c = embedding_model.encode([name])
token_count += c
tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)
@timeout(60)
def batch_encode(txts):
nonlocal embedding_model
return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts])
cnts_ = np.array([])
for i in range(0, len(texts), EMBEDDING_BATCH_SIZE):
async with embed_limiter:
vts, c = await trio.to_thread.run_sync(lambda: batch_encode(texts[i : i + EMBEDDING_BATCH_SIZE]))
if len(cnts_) == 0:
cnts_ = vts
else:
cnts_ = np.concatenate((cnts_, vts), axis=0)
token_count += c
if i % 33 == 32:
self.callback(i * 1.0 / len(texts) / parts / EMBEDDING_BATCH_SIZE + 0.5 * (parts - 1))
cnts = cnts_
title_w = float(self._param.filename_embd_weight)
vects = (title_w * tts + (1 - title_w) * cnts) if len(tts) == len(cnts) else cnts
assert len(vects) == len(chunks)
for i, ck in enumerate(chunks):
v = vects[i].tolist()
ck["q_%d_vec" % len(v)] = v
return chunks, token_count
async def _invoke(self, **kwargs):
try:
from_upstream = TokenizerFromUpstream.model_validate(kwargs)
except Exception as e:
self.set_output("_ERROR", f"Input error: {str(e)}")
return
self.set_output("output_format", "chunks")
parts = sum(["full_text" in self._param.search_method, "embedding" in self._param.search_method])
if "full_text" in self._param.search_method:
self.callback(random.randint(1, 5) / 100.0, "Start to tokenize.")
if from_upstream.chunks:
chunks = from_upstream.chunks
for i, ck in enumerate(chunks):
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
if ck.get("questions"):
ck["question_kwd"] = ck["questions"].split("\n")
ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
if ck.get("keywords"):
ck["important_kwd"] = ck["keywords"].split(",")
ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
if ck.get("summary"):
ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
else:
ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
if i % 100 == 99:
self.callback(i * 1.0 / len(chunks) / parts)
elif from_upstream.output_format in ["markdown", "text", "html"]:
if from_upstream.output_format == "markdown":
payload = from_upstream.markdown_result
elif from_upstream.output_format == "text":
payload = from_upstream.text_result
else:
payload = from_upstream.html_result
if not payload:
return ""
ck = {"text": payload}
if "full_text" in self._param.search_method:
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
ck["content_ltks"] = rag_tokenizer.tokenize(payload)
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
chunks = [ck]
else:
chunks = from_upstream.json_result
for i, ck in enumerate(chunks):
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
if i % 100 == 99:
self.callback(i * 1.0 / len(chunks) / parts)
self.callback(1.0 / parts, "Finish tokenizing.")
if "embedding" in self._param.search_method:
self.callback(random.randint(1, 5) / 100.0 + 0.5 * (parts - 1), "Start embedding inference.")
if from_upstream.name.strip() == "":
logging.warning("Tokenizer: empty name provided from upstream, embedding may be not accurate.")
chunks, token_count = await self._embedding(from_upstream.name, chunks)
self.set_output("embedding_token_consumption", token_count)
self.callback(1.0, "Finish embedding.")
self.set_output("chunks", chunks)