将flask改成fastapi

2025-10-13 13:18:03 +08:00
commit 88db2539b0
476 changed files with 739741 additions and 0 deletions
--- a/rag/app/init.py
+++ b/rag/app/init.py
@@ -0,0 +1,15 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
--- a/rag/app/audio.py
+++ b/rag/app/audio.py
@@ -0,0 +1,61 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+import re
+import tempfile
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import rag_tokenizer, tokenize
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+
+    # is it English
+    eng = lang.lower() == "english"  # is_english(sections)
+    try:
+        _, ext = os.path.splitext(filename)
+        if not ext:
+            raise RuntimeError("No extension detected.")
+
+        if ext not in [".da", ".wave", ".wav", ".mp3", ".wav", ".aac", ".flac", ".ogg", ".aiff", ".au", ".midi", ".wma", ".realaudio", ".vqf", ".oggvorbis", ".aac", ".ape"]:
+            raise RuntimeError(f"Extension {ext} is not supported yet.")
+
+        tmp_path = ""
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
+            tmpf.write(binary)
+            tmpf.flush()
+            tmp_path = os.path.abspath(tmpf.name)
+
+        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
+        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
+        ans = seq2txt_mdl.transcription(tmp_path)
+        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+
+        tokenize(doc, ans, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+    finally:
+        if tmp_path and os.path.exists(tmp_path):
+            try:
+                os.unlink(tmp_path)
+            except Exception:
+                pass
+    return []
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -0,0 +1,160 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+from tika import parser
+import re
+from io import BytesIO
+
+from deepdoc.parser.utils import get_text
+from rag.nlp import bullets_category, is_english,remove_contents_table, \
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
+    tokenize_chunks
+from rag.nlp import rag_tokenizer
+from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
+
+
+class Pdf(PdfParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback)
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._naive_vertical_merge()
+        self._filter_forpages()
+        self._merge_with_same_bullet()
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
+
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
+                for b in self.boxes], tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+        Since a book is long and not all the parts are useful, if it's a PDF,
+        please setup the page ranges for every book in order eliminate negative effects and save elapsed computing time.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    pdf_parser = None
+    sections, tbls = [], []
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        doc_parser = DocxParser()
+        # TODO: table of contents need to be removed
+        sections, tbls = doc_parser(
+            binary if binary else filename, from_page=from_page, to_page=to_page)
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        tbls = [((None, lns), None) for lns in tbls]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+            pdf_parser = PlainParser()
+        sections, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+
+    elif re.search(r"\.txt$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        sections = txt.split("\n")
+        sections = [(line, "") for line in sections if line]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = HtmlParser()(filename, binary)
+        sections = [(line, "") for line in sections if line]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [(line, "") for line in sections if line]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(doc, docx, pdf, txt supported)")
+
+    make_colon_as_title(sections)
+    bull = bullets_category(
+        [t for t in random_choices([t for t, _ in sections], k=100)])
+    if bull >= 0:
+        chunks = ["\n".join(ck)
+                  for ck in hierarchical_merge(bull, sections, 5)]
+    else:
+        sections = [s.split("@") for s, _ in sections]
+        sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]
+        chunks = naive_merge(
+            sections, kwargs.get(
+                "chunk_token_num", 256), kwargs.get(
+                "delimer", "\n。；！？"))
+
+    # is it English
+    # is_english(random_choices([t for t, _ in sections], k=218))
+    eng = lang.lower() == "english"
+
+    res = tokenize_table(tbls, doc, eng)
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+    chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
--- a/rag/app/email.py
+++ b/rag/app/email.py
@@ -0,0 +1,117 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+from email import policy
+from email.parser import BytesParser
+from rag.app.naive import chunk as naive_chunk
+import re
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    #  get the email main info
+    def _add_content(msg, content_type):
+        if content_type == "text/plain":
+            text_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif content_type == "text/html":
+            html_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt), chunk_token_num=parser_config["chunk_token_num"]) if line
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。；！？"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    logging.debug("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -0,0 +1,213 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+from tika import parser
+import re
+from io import BytesIO
+from docx import Document
+
+from api.db import ParserType
+from deepdoc.parser.utils import get_text
+from rag.nlp import bullets_category, remove_contents_table, \
+    make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
+from rag.nlp import rag_tokenizer, Node
+from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
+
+
+
+
+
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+
+    def __clean(self, line):
+        line = re.sub(r"\u3000", " ", line).strip()
+        return line
+
+    def old_call(self, filename, binary=None, from_page=0, to_page=100000):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        lines = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        return [line for line in lines if line]
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
+            self.doc = Document(
+                filename) if not binary else Document(BytesIO(binary))
+            pn = 0
+            lines = []
+            level_set = set()
+            bull = bullets_category([p.text for p in self.doc.paragraphs])
+            for p in self.doc.paragraphs:
+                if pn > to_page:
+                    break
+                question_level, p_text = docx_question_level(p, bull)
+                if not p_text.strip("\n"):
+                    continue
+                lines.append((question_level, p_text))
+                level_set.add(question_level)
+                for run in p.runs:
+                    if 'lastRenderedPageBreak' in run._element.xml:
+                        pn += 1
+                        continue
+                    if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                        pn += 1
+
+            sorted_levels = sorted(level_set)
+
+            h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1
+            h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level
+
+            root = Node(level=0, depth=h2_level, texts=[])
+            root.build_tree(lines)
+
+            return [("\n").join(element) for element in root.get_tree() if element]
+
+
+    def __str__(self) -> str:
+        return f'''
+            question:{self.question},
+            answer:{self.answer},
+            level:{self.level},
+            childs:{self.childs}
+        '''
+
+
+class Pdf(PdfParser):
+    def __init__(self):
+        self.model_speciess = ParserType.LAWS.value
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts:".format(
+            ))
+        self._naive_vertical_merge()
+
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
+
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], None
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    pdf_parser = None
+    sections = []
+    # is it English
+    eng = lang.lower() == "english"  # is_english(sections)
+
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        chunks = Docx()(filename, binary)
+        callback(0.7, "Finish parsing.")
+        return tokenize_chunks(chunks, doc, eng, None)
+    
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+            pdf_parser = PlainParser()
+        for txt, poss in pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
+            sections.append(txt + poss)
+
+    elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        sections = txt.split("\n")
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = HtmlParser()(filename, binary)
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(doc, docx, pdf, txt supported)")
+
+
+    # Remove 'Contents' part
+    remove_contents_table(sections, eng)
+
+    make_colon_as_title(sections)
+    bull = bullets_category(sections)
+    res = tree_merge(bull, sections, 2)
+
+
+    if not res:
+        callback(0.99, "No chunk parsed out.")
+
+    return tokenize_chunks(res, doc, eng, pdf_parser)
+
+    # chunks = hierarchical_merge(bull, sections, 5)
+    #     return tokenize_chunks(["\n".join(ck)for ck in chunks], doc, eng, pdf_parser)
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -0,0 +1,285 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+import copy
+import re
+
+from api.db import ParserType
+from io import BytesIO
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
+from rag.utils import num_tokens_from_string
+from deepdoc.parser import PdfParser, PlainParser, DocxParser
+from docx import Document
+from PIL import Image
+
+
+class Pdf(PdfParser):
+    def __init__(self):
+        self.model_speciess = ParserType.MANUAL.value
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.debug("OCR: {}".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._concat_downward()
+        self._filter_forpages()
+        callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
+
+        # clean mess
+        for b in self.boxes:
+            b["text"] = re.sub(r"([\t 　]|\u3000){2,}", " ", b["text"].strip())
+
+        return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
+                for i, b in enumerate(self.boxes)], tbls
+
+
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+
+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob))
+        return image
+
+    def concat_img(self, img1, img2):
+        if img1 and not img2:
+            return img1
+        if not img1 and img2:
+            return img2
+        if not img1 and not img2:
+            return None
+        width1, height1 = img1.size
+        width2, height2 = img2.size
+
+        new_width = max(width1, width2)
+        new_height = height1 + height2
+        new_image = Image.new('RGB', (new_width, new_height))
+
+        new_image.paste(img1, (0, 0))
+        new_image.paste(img2, (0, height1))
+
+        return new_image
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        last_answer, last_image = "", None
+        question_stack, level_stack = [], []
+        ti_list = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            question_level, p_text = 0, ''
+            if from_page <= pn < to_page and p.text.strip():
+                question_level, p_text = docx_question_level(p)
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{p_text}'
+                current_image = self.get_picture(self.doc, p)
+                last_image = self.concat_img(last_image, current_image)
+            else:   # is a question
+                if last_answer or last_image:
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                    last_answer, last_image = '', None
+
+                i = question_level
+                while question_stack and i <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(p_text)
+                level_stack.append(question_level)
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        if last_answer:
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                
+        tbls = []
+        for tb in self.doc.tables:
+            html= "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    for j in range(i+1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j
+                        else:
+                            break
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return ti_list, tbls
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Chunk a manual. PDF files and files matching ".doc"/".docx" are handled.
+
+        For a PDF, consecutive sections are grouped by section id (derived from
+        the outline or from title frequency) and merged into chunks by token
+        count. For docx, every (text, image) pair from the Docx parser becomes
+        one chunk.
+
+        Returns the tokenized table chunks followed by the text chunks.
+        Raises NotImplementedError for any other file extension.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    pdf_parser = None
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    # is it English
+    eng = lang.lower() == "english"  # pdf_parser.is_english
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+            pdf_parser = PlainParser()
+        sections, tbls = pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)
+        # Two-item sections (presumably PlainParser output — TODO confirm) get
+        # an all-zero placeholder position so every section is a 3-tuple.
+        if sections and len(sections[0]) < 3:
+            sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
+        # set pivot using the most frequent type of title,
+        # then merge between 2 pivot
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
+            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
+            most_level = max(0, max_lvl - 1)
+            levels = []
+            for txt, _, _ in sections:
+                # Match the section against each outline title by comparing
+                # their character-bigram sets; above 0.8 overlap it takes that
+                # outline entry's level.
+                for t, lvl in pdf_parser.outlines:
+                    tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
+                    tks_ = set([txt[i] + txt[i + 1]
+                                for i in range(min(len(t), len(txt) - 1))])
+                    if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
+                        levels.append(lvl)
+                        break
+                else:
+                    # No outline entry matched: rank below every outline level.
+                    levels.append(max_lvl + 1)
+
+        else:
+            bull = bullets_category([txt for txt, _, _ in sections])
+            most_level, levels = title_frequency(
+                bull, [(txt, lvl) for txt, lvl, _ in sections])
+
+        assert len(sections) == len(levels)
+        # Assign a running section id; it advances at a title-level entry
+        # (lvl <= most_level) whose level differs from the previous entry's.
+        sec_ids = []
+        sid = 0
+        for i, lvl in enumerate(levels):
+            if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+                sid += 1
+            sec_ids.append(sid)
+
+        sections = [(txt, sec_ids[i], poss)
+                    for i, (txt, _, poss) in enumerate(sections)]
+        # Tables join the section list with section id -1 and page numbers
+        # shifted by 1 - from_page.
+        for (img, rows), poss in tbls:
+            if not rows:
+                continue
+            sections.append((rows if isinstance(rows, str) else rows[0], -1,
+                            [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+
+        def tag(pn, left, right, top, bottom):
+            # An all-zero position (the placeholder added above) yields no tag.
+            if pn + left + right + top + bottom == 0:
+                return ""
+            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+                .format(pn, left, right, top, bottom)
+
+        chunks = []
+        last_sid = -2
+        tk_cnt = 0
+        # Visit sections ordered by fields 0, 3 and 1 of their first position
+        # (presumably page, top, left — TODO confirm against get_position).
+        for txt, sec_id, poss in sorted(sections, key=lambda x: (
+                x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+            poss = "\t".join([tag(*pos) for pos in poss])
+            # Extend the current chunk while it is still tiny (< 32 tokens), or
+            # while it is below 1024 tokens and the entry is a table or belongs
+            # to the same section.
+            if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
+                if chunks:
+                    chunks[-1] += "\n" + txt + poss
+                    tk_cnt += num_tokens_from_string(txt)
+                    continue
+            chunks.append(txt + poss)
+            tk_cnt = num_tokens_from_string(txt)
+            if sec_id > -1:
+                last_sid = sec_id
+
+        res = tokenize_table(tbls, doc, eng)
+        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+        return res
+
+    # NOTE(review): this pattern also matches legacy ".doc" files, which are
+    # then handed to the docx parser — confirm that is intended.
+    elif re.search(r"\.docx?$", filename, re.IGNORECASE):
+        docx_parser = Docx()
+        # NOTE(review): the caller's from_page/to_page are not forwarded here;
+        # a fixed 0-10000 window is used instead.
+        ti_list, tbls = docx_parser(filename, binary,
+                                    from_page=0, to_page=10000, callback=callback)
+        res = tokenize_table(tbls, doc, eng)
+        for text, image in ti_list:
+            d = copy.deepcopy(doc)
+            if image:
+                d['image'] = image
+                d["doc_type_kwd"] = "image"
+            tokenize(d, text, eng)
+            res.append(d)
+        return res
+    else:
+        raise NotImplementedError("file type not supported yet(pdf and docx supported)")
+    
+
+if __name__ == "__main__":
+    import sys
+
+
+    def dummy(prog=None, msg=""):
+        pass
+
+
+    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -0,0 +1,603 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+import re
+from functools import reduce
+from io import BytesIO
+from timeit import default_timer as timer
+
+from docx import Document
+from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
+from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
+from docx.opc.oxml import parse_xml
+from markdown import markdown
+from PIL import Image
+from tika import parser
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
+from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
+from deepdoc.parser.pdf_parser import PlainParser, VisionParser
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
+
+
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+
+    def get_picture(self, document, paragraph):
+        imgs = paragraph._element.xpath('.//pic:pic')
+        if not imgs:
+            return None
+        res_img = None
+        for img in imgs:
+            embed = img.xpath('.//a:blip/@r:embed')
+            if not embed:
+                continue
+            embed = embed[0]
+            try:
+                related_part = document.part.related_parts[embed]
+                image_blob = related_part.image.blob
+            except UnrecognizedImageError:
+                logging.info("Unrecognized image format. Skipping image.")
+                continue
+            except UnexpectedEndOfFileError:
+                logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
+                continue
+            except InvalidImageStreamError:
+                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
+                continue
+            except UnicodeDecodeError:
+                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
+                continue
+            except Exception:
+                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
+                continue
+            try:
+                image = Image.open(BytesIO(image_blob)).convert('RGB')
+                if res_img is None:
+                    res_img = image
+                else:
+                    res_img = concat_img(res_img, image)
+            except Exception:
+                continue
+
+        return res_img
+
+    def __clean(self, line):
+        line = re.sub(r"\u3000", " ", line).strip()
+        return line
+
+    def __get_nearest_title(self, table_index, filename):
+        """Get the hierarchical title structure before the table"""
+        import re
+        from docx.text.paragraph import Paragraph
+
+        titles = []
+        blocks = []
+
+        # Get document name from filename parameter
+        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
+        if not doc_name:
+            doc_name = "Untitled Document"
+
+        # Collect all document blocks while maintaining document order
+        try:
+            # Iterate through all paragraphs and tables in document order
+            for i, block in enumerate(self.doc._element.body):
+                if block.tag.endswith('p'):  # Paragraph
+                    p = Paragraph(block, self.doc)
+                    blocks.append(('p', i, p))
+                elif block.tag.endswith('tbl'):  # Table
+                    blocks.append(('t', i, None))  # Table object will be retrieved later
+        except Exception as e:
+            logging.error(f"Error collecting blocks: {e}")
+            return ""
+
+        # Find the target table position
+        target_table_pos = -1
+        table_count = 0
+        for i, (block_type, pos, _) in enumerate(blocks):
+            if block_type == 't':
+                if table_count == table_index:
+                    target_table_pos = pos
+                    break
+                table_count += 1
+
+        if target_table_pos == -1:
+            return ""  # Target table not found
+
+        # Find the nearest heading paragraph in reverse order
+        nearest_title = None
+        for i in range(len(blocks)-1, -1, -1):
+            block_type, pos, block = blocks[i]
+            if pos >= target_table_pos:  # Skip blocks after the table
+                continue
+
+            if block_type != 'p':
+                continue
+
+            if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
+                try:
+                    level_match = re.search(r"(\d+)", block.style.name)
+                    if level_match:
+                        level = int(level_match.group(1))
+                        if level <= 7:  # Support up to 7 heading levels
+                            title_text = block.text.strip()
+                            if title_text:  # Avoid empty titles
+                                nearest_title = (level, title_text)
+                                break
+                except Exception as e:
+                    logging.error(f"Error parsing heading level: {e}")
+
+        if nearest_title:
+            # Add current title
+            titles.append(nearest_title)
+            current_level = nearest_title[0]
+
+            # Find all parent headings, allowing cross-level search
+            while current_level > 1:
+                found = False
+                for i in range(len(blocks)-1, -1, -1):
+                    block_type, pos, block = blocks[i]
+                    if pos >= target_table_pos:  # Skip blocks after the table
+                        continue
+
+                    if block_type != 'p':
+                        continue
+
+                    if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
+                        try:
+                            level_match = re.search(r"(\d+)", block.style.name)
+                            if level_match:
+                                level = int(level_match.group(1))
+                                # Find any heading with a higher level
+                                if level < current_level:
+                                    title_text = block.text.strip()
+                                    if title_text:  # Avoid empty titles
+                                        titles.append((level, title_text))
+                                        current_level = level
+                                        found = True
+                                        break
+                        except Exception as e:
+                            logging.error(f"Error parsing parent heading: {e}")
+
+                if not found:  # Break if no parent heading is found
+                    break
+
+            # Sort by level (ascending, from highest to lowest)
+            titles.sort(key=lambda x: x[0])
+            # Organize titles (from highest to lowest)
+            hierarchy = [doc_name] + [t[1] for t in titles]
+            return " > ".join(hierarchy)
+
+        return ""
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        lines = []
+        last_image = None
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page:
+                if p.text.strip():
+                    if p.style and p.style.name == 'Caption':
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
+                else:
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
+
+        tbls = []
+        for i, tb in enumerate(self.doc.tables):
+            title = self.__get_nearest_title(i, filename)
+            html = "<table>"
+            if title:
+                html += f"<caption>Table Location: {title}</caption>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                try:
+                    while i < len(r.cells):
+                        span = 1
+                        c = r.cells[i]
+                        for j in range(i + 1, len(r.cells)):
+                            if c.text == r.cells[j].text:
+                                span += 1
+                                i = j
+                            else:
+                                break
+                        i += 1
+                        html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                except Exception as e:
+                    logging.warning(f"Error parsing table, ignore: {e}")
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return new_line, tbls
+
+
+class Pdf(PdfParser):
+    def __init__(self):
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
+        start = timer()
+        first_start = start
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
+
+        if separate_tables_figures:
+            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
+            self._concat_downward()
+            logging.info("layouts cost: {}s".format(timer() - first_start))
+            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
+        else:
+            tbls = self._extract_table_figure(True, zoomin, True, True)
+            self._naive_vertical_merge()
+            self._concat_downward()
+            # self._filter_forpages()
+            logging.info("layouts cost: {}s".format(timer() - first_start))
+            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
+
+
+class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+
+        from bs4 import BeautifulSoup
+        html_content = markdown(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        # Find all image URLs in text
+        for url in image_urls:
+            try:
+                # check if the url is a local file or a remote URL
+                if url.startswith(('http://', 'https://')):
+                    # For remote URLs, download the image
+                    response = requests.get(url, stream=True, timeout=30)
+                    if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                        img = Image.open(BytesIO(response.content)).convert('RGB')
+                        images.append(img)
+                else:
+                    # For local file paths, open the image directly
+                    from pathlib import Path
+                    local_path = Path(url)
+                    if not local_path.exists():
+                        logging.warning(f"Local image file not found: {url}")
+                        continue
+                    img = Image.open(url).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+
+        return images if images else None
+
+    def __call__(self, filename, binary=None, separate_tables=True):
+        if binary:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+        else:
+            with open(filename, "r") as f:
+                txt = f.read()
+
+        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
+
+        extractor = MarkdownElementExtractor(txt)
+        element_sections = extractor.extract_elements()
+        sections = [(element, "") for element in element_sections]
+
+        tbls = []
+        for table in tables:
+            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+        return sections, tbls
+
+def load_from_xml_v2(baseURI, rels_item_xml):
+    """
+    Return |_SerializedRelationships| instance loaded with the
+    relationships contained in *rels_item_xml*. Returns an empty
+    collection if *rels_item_xml* is |None|.
+    """
+    srels = _SerializedRelationships()
+    if rels_item_xml is not None:
+        rels_elm = parse_xml(rels_item_xml)
+        for rel_elm in rels_elm.Relationship_lst:
+            if rel_elm.target_ref in ('../NULL', 'NULL'):
+                continue
+            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
+    return srels
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, excel, txt.
+        This method apply the naive ways to chunk files.
+        Successive text will be sliced into pieces using 'delimiter'.
+        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
+    """
+
+    is_english = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    res = []
+    pdf_parser = None
+    section_images = None
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+
+        try:
+            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
+            callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
+        except Exception:
+            vision_model = None
+
+        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
+        _SerializedRelationships.load_from_xml = load_from_xml_v2
+        sections, tables = Docx()(filename, binary)
+
+        if vision_model:
+            figures_data = vision_figure_parser_figure_data_wrapper(sections)
+            try:
+                docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
+                boosted_figures = docx_vision_parser(callback=callback)
+                tables.extend(boosted_figures)
+            except Exception as e:
+                callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
+
+        res = tokenize_table(tables, doc, is_english)
+        callback(0.8, "Finish parsing.")
+
+        st = timer()
+
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。；！？"))
+
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        logging.info("naive_merge({}): {}".format(filename, timer() - st))
+        return res
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+        callback(0.1, "Start to parse.")
+
+        if layout_recognizer == "DeepDOC":
+            pdf_parser = Pdf()
+
+            try:
+                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
+                callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
+            except Exception:
+                vision_model = None
+
+            if vision_model:
+                sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
+                callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
+                try:
+                    pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
+                    boosted_figures = pdf_vision_parser(callback=callback)
+                    tables.extend(boosted_figures)
+                except Exception as e:
+                    callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
+                    tables.extend(figures)
+            else:
+                sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
+
+            res = tokenize_table(tables, doc, is_english)
+            callback(0.8, "Finish parsing.")
+
+        else:
+            if layout_recognizer == "Plain Text":
+                pdf_parser = PlainParser()
+            else:
+                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
+                pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
+
+            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
+                                          callback=callback)
+            res = tokenize_table(tables, doc, is_english)
+            callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = ExcelParser()
+        if parser_config.get("html4excel"):
+            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+        else:
+            sections = [(_, "") for _ in excel_parser(binary) if _]
+        parser_config["chunk_token_num"] = 12800
+
+    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = TxtParser()(filename, binary,
+                               parser_config.get("chunk_token_num", 128),
+                               parser_config.get("delimiter", "\n!?;。；！？"))
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary, separate_tables=False)
+
+        try:
+            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
+            callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
+        except Exception:
+            vision_model = None
+        
+        if vision_model:
+            # Process images for each section
+            section_images = []
+            for idx, (section_text, _) in enumerate(sections):
+                images = markdown_parser.get_pictures(section_text) if section_text else None
+
+                if images:
+                    # If multiple images found, combine them using concat_img
+                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                    section_images.append(combined_image)
+                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
+                    boosted_figures = markdown_vision_parser(callback=callback)
+                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
+                else:
+                    section_images.append(None)
+        else:
+            logging.warning("No visual model detected. Skipping figure parsing enhancement.")
+
+        res = tokenize_table(tables, doc, is_english)
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
+        sections = HtmlParser()(filename, binary, chunk_token_num)
+        sections = [(_, "") for _ in sections if _]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
+        sections = JsonParser(chunk_token_num)(binary)
+        sections = [(_, "") for _ in sections if _]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        if doc_parsed.get('content', None) is not None:
+            sections = doc_parsed['content'].split('\n')
+            sections = [(_, "") for _ in sections if _]
+            callback(0.8, "Finish parsing.")
+        else:
+            callback(0.8, f"tika.parser got empty content from {filename}.")
+            logging.warning(f"tika.parser got empty content from {filename}.")
+            return []
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
+
+    st = timer()
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None
+
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                        int(parser_config.get(
+                                            "chunk_token_num", 128)), parser_config.get(
+                                            "delimiter", "\n!?。；！？"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。；！？"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+
+    logging.info("naive_merge({}): {}".format(filename, timer() - st))
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -0,0 +1,141 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+from tika import parser
+from io import BytesIO
+import re
+
+from deepdoc.parser.utils import get_text
+from rag.app import naive
+from rag.nlp import rag_tokenizer, tokenize
+from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
+
+
+class Pdf(PdfParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin, drop=False)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug("layouts cost: {}s".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._concat_downward()
+
+        sections = [(b["text"], self.get_position(b, zoomin))
+                    for i, b in enumerate(self.boxes)]
+        for (img, rows), poss in tbls:
+            if not rows:
+                continue
+            sections.append((rows if isinstance(rows, str) else rows[0],
+                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
+            x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, excel, txt.
+        One file forms a chunk which maintains original text order.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    eng = lang.lower() == "english"  # is_english(cks)
+
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections, tbls = naive.Docx()(filename, binary)
+        sections = [s for s, _ in sections if s]
+        for (_, html), _ in tbls:
+            sections.append(html)
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+            pdf_parser = PlainParser()
+        sections, _ = pdf_parser(
+            filename if not binary else binary, to_page=to_page, callback=callback)
+        sections = [s for s, _ in sections if s]
+
+    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = ExcelParser()
+        sections = excel_parser.html(binary, 1000000000)
+
+    elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        sections = txt.split("\n")
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = HtmlParser()(filename, binary)
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [s for s in sections if s]
+        callback(0.8, "Finish parsing.")
+
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(doc, docx, pdf, txt supported)")
+
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    tokenize(doc, "\n".join(sections), eng)
+    return [doc]
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -0,0 +1,297 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+import copy
+import re
+
+from api.db import ParserType
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from deepdoc.parser import PdfParser, PlainParser
+import numpy as np
+
+
+class Pdf(PdfParser):  # paper-oriented PDF parser: yields title, authors, abstract, body sections and tables
+    def __init__(self):
+        self.model_speciess = ParserType.PAPER.value  # set before the base initializer runs; presumably read there - TODO confirm
+        super().__init__()
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):  # callback is invoked unconditionally, so None raises TypeError
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,  # the binary payload wins over the file name when both are given
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._layouts_rec(zoomin)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+        logging.debug(f"layouts cost: {timer() - start}s")
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])  # median width of the text boxes
+        self._concat_downward()
+        self._filter_forpages()
+        callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
+
+        # A median box narrower than half the first page image's width / zoomin is treated as two-column: re-sort by X.
+        if column_width < self.page_images[0].size[0] / zoomin / 2:
+            logging.debug("two_column................... {} {}".format(column_width,
+                  self.page_images[0].size[0] / zoomin / 2))
+            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
+        for b in self.boxes:  # collapse runs of 2+ tabs / spaces / ideographic spaces into one space
+            b["text"] = re.sub(r"([\t 　]|\u3000){2,}", " ", b["text"].strip())
+
+        def _begin(txt):  # match object when txt opens (after optional numbering) with a front-matter heading, else None
+            return re.match(
+                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
+                txt.lower().strip())
+
+        if from_page > 0:  # not starting at the first page: skip title/author/abstract detection
+            return {
+                "title": "",
+                "authors": "",
+                "abstract": "",
+                "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
+                             re.match(r"(text|title)", b.get("layoutno", "text"))],  # boxes without layoutno count as text
+                "tables": tbls
+            }
+        # get title and authors: scan at most the first 32 boxes for the first box laid out as a title
+        title = ""
+        authors = []
+        i = 0
+        while i < min(32, len(self.boxes)-1):
+            b = self.boxes[i]
+            i += 1  # i now indexes the box right after b
+            if b.get("layoutno", "").find("title") >= 0:
+                title = b["text"]
+                if _begin(title):  # a front-matter heading is not a paper title
+                    title = ""
+                    break
+                for j in range(3):  # NOTE(review): the unconditional break below stops after j == 0, so at most one author line is kept
+                    if _begin(self.boxes[i + j]["text"]):
+                        break
+                    authors.append(self.boxes[i + j]["text"])
+                    break
+                break
+        # get abstract: scan at most the first 32 boxes for one starting with "abstract" / "摘要"
+        abstr = ""
+        i = 0
+        while i + 1 < min(32, len(self.boxes)):
+            b = self.boxes[i]
+            i += 1
+            txt = b["text"].lower().strip()
+            if re.match("(abstract|摘要)", txt):
+                if len(txt.split()) > 32 or len(txt) > 64:  # heading and abstract body share this box
+                    abstr = txt + self._line_tag(b, zoomin)
+                    break
+                txt = self.boxes[i]["text"].lower().strip()  # otherwise try the box that follows the heading
+                if len(txt.split()) > 32 or len(txt) > 64:
+                    abstr = txt + self._line_tag(self.boxes[i], zoomin)
+                i += 1
+                break
+        if not abstr:  # no abstract found: keep every box in the sections returned below
+            i = 0
+
+        callback(
+            0.8, "Page {}~{}: Text merging finished".format(
+                from_page, min(
+                    to_page, self.total_page)))
+        for b in self.boxes:
+            logging.debug("{} {}".format(b["text"], b.get("layoutno")))
+        logging.debug("{}".format(tbls))
+
+        return {
+            "title": title,
+            "authors": " ".join(authors),
+            "abstract": abstr,  # lower-cased text followed by its position tag, or ""
+            "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
+                         re.match(r"(text|title)", b.get("layoutno", "text"))],  # only boxes from index i onwards
+            "tables": tbls
+        }
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
+    """
+        Chunk a paper PDF; only file names ending in ".pdf" are accepted, others raise NotImplementedError.
+        The abstract is kept whole as one chunk; body sections are merged between title pivots.
+    """
+    parser_config = kwargs.get(
+        "parser_config", {  # this default applies only when the "parser_config" keyword is absent
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+            pdf_parser = PlainParser()
+            paper = {  # plain-text mode: no layout analysis, so no authors, abstract or tables
+                "title": filename,
+                "authors": " ",
+                "abstract": "",
+                "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
+                "tables": []
+            }
+        else:
+            pdf_parser = Pdf()
+            paper = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
+    else:
+        raise NotImplementedError("file type not supported yet(pdf supported)")
+
+    doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
+           "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}  # file name stands in for an empty title
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
+    # English is decided by the lang argument alone
+    eng = lang.lower() == "english"  # pdf_parser.is_english
+    logging.debug("It's English.....{}".format(eng))
+
+    res = tokenize_table(paper["tables"], doc, eng)  # table chunks come first in the result
+
+    if paper["abstract"]:
+        d = copy.deepcopy(doc)
+        txt = pdf_parser.remove_tag(paper["abstract"])
+        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
+        d["important_tks"] = " ".join(d["important_kwd"])
+        d["image"], poss = pdf_parser.crop(
+            paper["abstract"], need_position=True)
+        add_positions(d, poss)
+        tokenize(d, txt, eng)
+        res.append(d)
+
+    sorted_sections = paper["sections"]
+    # Use the most frequent title level as the pivot,
+    # then merge the sections lying between two pivots.
+    bull = bullets_category([txt for txt, _ in sorted_sections])
+    most_level, levels = title_frequency(bull, sorted_sections)
+    assert len(sorted_sections) == len(levels)
+    sec_ids = []
+    sid = 0
+    for i, lvl in enumerate(levels):
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:  # a pivot-or-lower level that differs from the previous one
+            sid += 1
+        sec_ids.append(sid)
+        logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
+
+    chunks = []
+    last_sid = -2  # sentinel that equals no real section id
+    for (txt, _), sec_id in zip(sorted_sections, sec_ids):
+        if sec_id == last_sid:
+            if chunks:  # same section as the previous line: append to the current chunk
+                chunks[-1] += "\n" + txt
+                continue
+        chunks.append(txt)
+        last_sid = sec_id
+    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+    return res
+
+
+"""
+    readed = [0] * len(paper["lines"])
+    # find colon firstly
+    i = 0
+    while i + 1 < len(paper["lines"]):
+        txt = pdf_parser.remove_tag(paper["lines"][i][0])
+        j = i
+        if txt.strip("\n").strip()[-1] not in ":：":
+            i += 1
+            continue
+        i += 1
+        while i < len(paper["lines"]) and not paper["lines"][i][0]:
+            i += 1
+        if i >= len(paper["lines"]): break
+        proj = [paper["lines"][i][0].strip()]
+        i += 1
+        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
+            proj.append(paper["lines"][i])
+            i += 1
+        for k in range(j, i): readed[k] = True
+        txt = txt[::-1]
+        if eng:
+            r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        else:
+            r = re.search(r"(.*?) ([。？；！]|$)", txt)
+            txt = r.group(1)[::-1] if r else txt[::-1]
+        for p in proj:
+            d = copy.deepcopy(doc)
+            txt += "\n" + pdf_parser.remove_tag(p)
+            d["image"], poss = pdf_parser.crop(p, need_position=True)
+            add_positions(d, poss)
+            tokenize(d, txt, eng)
+            res.append(d)
+
+    i = 0
+    chunk = []
+    tk_cnt = 0
+    def add_chunk():
+        nonlocal chunk, res, doc, pdf_parser, tk_cnt
+        d = copy.deepcopy(doc)
+        ck = "\n".join(chunk)
+        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
+        d["image"], poss = pdf_parser.crop(ck, need_position=True)
+        add_positions(d, poss)
+        res.append(d)
+        chunk = []
+        tk_cnt = 0
+
+    while i < len(paper["lines"]):
+        if tk_cnt > 128:
+            add_chunk()
+        if readed[i]:
+            i += 1
+            continue
+        readed[i] = True
+        txt, layouts = paper["lines"][i]
+        txt_ = pdf_parser.remove_tag(txt)
+        i += 1
+        cnt = num_tokens_from_string(txt_)
+        if any([
+            layouts.find("title") >= 0 and chunk,
+            cnt + tk_cnt > 128 and tk_cnt > 32,
+        ]):
+            add_chunk()
+            chunk = [txt]
+            tk_cnt = cnt
+        else:
+            chunk.append(txt)
+            tk_cnt += cnt
+
+    if chunk: add_chunk()
+    for i, d in enumerate(res):
+        print(d)
+        # d["image"].save(f"./logs/{i}.jpg")
+    return res
+"""
+
+if __name__ == "__main__":  # manual smoke test: chunk the PDF named by the first CLI argument
+    import sys
+
+    def dummy(prog=None, msg=""):  # no-op progress callback; works for callback(msg=...) and callback(prog, msg)
+        pass
+    chunk(sys.argv[1], callback=dummy)  # the returned chunks are discarded
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -0,0 +1,91 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import io
+import re
+
+import numpy as np
+from PIL import Image
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from deepdoc.vision import OCR
+from rag.nlp import tokenize
+from rag.utils import clean_markdown_block
+from rag.nlp import rag_tokenizer
+
+
+ocr = OCR()  # module-level OCR engine: constructed once at import time and called by chunk() below
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):  # image -> [doc] via OCR, plus a vision-model description for short OCR text
+    img = Image.open(io.BytesIO(binary)).convert('RGB')
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),  # file name without its extension
+        "image": img,
+        "doc_type_kwd": "image"
+    }
+    bxs = ocr(np.array(img))
+    txt = "\n".join([t[0] for _, t in bxs if t[0]])  # one line per non-empty recognized text
+    eng = lang.lower() == "english"
+    callback(0.4, "Finish OCR: (%s ...)" % txt[:12])  # callback is called unconditionally, so None raises TypeError
+    if (eng and len(txt.split()) > 32) or len(txt) > 32:  # NOTE(review): >32 words implies >32 chars, so the first clause never changes the outcome
+        tokenize(doc, txt, eng)
+        callback(0.8, "OCR results is too long to use CV LLM.")
+        return [doc]
+
+    try:
+        callback(0.4, "Use CV LLM to describe the picture.")
+        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
+        img_binary = io.BytesIO()
+        img.save(img_binary, format='JPEG')
+        img_binary.seek(0)
+        ans = cv_mdl.describe(img_binary.read())
+        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
+        txt += "\n" + ans  # the description is appended after the OCR text
+        tokenize(doc, txt, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))  # failure is reported through the callback, not raised
+
+    return []  # reached only after an exception: the OCR text is dropped as well
+
+
+def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
+    """
+    Describe an image with a vision model and return the cleaned markdown text.
+
+    Returns:
+        A newline followed by the cleaned model answer, or "" when any exception occurs (reported via callback(-1, ...)).
+    """
+    callback = callback or (lambda prog, msg: None)  # a missing callback becomes a no-op
+
+    img = binary  # despite the name, this object must offer .save(fp, format=...) - presumably a PIL image, TODO confirm
+    txt = ""
+
+    try:
+        with io.BytesIO() as img_binary:
+            img.save(img_binary, format='JPEG')  # the model receives JPEG-encoded bytes
+            img_binary.seek(0)
+            ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
+            txt += "\n" + ans
+            return txt
+
+    except Exception as e:
+        callback(-1, str(e))
+
+    return ""
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -0,0 +1,168 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import copy
+import re
+from io import BytesIO
+
+from PIL import Image
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from deepdoc.parser.pdf_parser import VisionParser
+from rag.nlp import tokenize, is_english
+from rag.nlp import rag_tokenizer
+from deepdoc.parser import PdfParser, PptParser, PlainParser
+from PyPDF2 import PdfReader as pdf2_read
+
+
+class Ppt(PptParser):  # PowerPoint parser that pairs each slide's text with a thumbnail image
+    def __call__(self, fnm, from_page, to_page, callback=None):  # returns [(text, PIL image), ...]; callback must not be None
+        txts = super().__call__(fnm, from_page, to_page)
+
+        callback(0.5, "Text extraction finished.")
+        import aspose.slides as slides  # imported lazily, only when a presentation is actually parsed
+        import aspose.pydrawing as drawing
+        imgs = []
+        with slides.Presentation(BytesIO(fnm)) as presentation:  # fnm is wrapped in BytesIO, so it must be bytes here
+            for i, slide in enumerate(presentation.slides[from_page: to_page]):
+                try:
+                    with BytesIO() as buffered:
+                        slide.get_thumbnail(
+                            0.1, 0.1).save(
+                            buffered, drawing.imaging.ImageFormat.jpeg)
+                        buffered.seek(0)
+                        imgs.append(Image.open(buffered).copy())  # copy() keeps the image usable after the buffer closes
+                except RuntimeError as e:  # i counts from the start of the slice, so add from_page for the real slide number
+                    raise RuntimeError(f'ppt parse error at page {from_page + i + 1}, original error: {str(e)}') from e
+        assert len(imgs) == len(
+            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+        callback(0.9, "Image extraction finished")
+        self.is_english = is_english(txts)
+        return [(txts[i], imgs[i]) for i in range(len(txts))]
+
+
+class Pdf(PdfParser):
+    def __init__(self):
+        super().__init__()
+
+    def __garbage(self, txt):
+        # A line is noise when it is shorter than 3 characters or is made up
+        # solely of digits and the punctuation . , % / - (e.g. page numbers).
+        txt = txt.lower().strip()
+        if len(txt) < 3:
+            return True
+        return bool(re.match(r"[0-9\.,%/-]+$", txt))
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        # OCR the requested pages and return one (text, page image) pair per
+        # page, with noise lines removed from the text.
+        from timeit import default_timer as timer
+        began = timer()
+        callback(msg="OCR started")
+        source = filename if not binary else binary
+        self.__images__(source, zoomin, from_page, to_page, callback)
+        last_page = min(to_page, self.total_page)
+        callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, last_page, timer() - began))
+        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
+            len(self.boxes), len(self.page_images))
+        pages = [
+            ("\n".join(b["text"] for b in page_boxes if not self.__garbage(b["text"])), page_img)
+            for page_boxes, page_img in zip(self.boxes, self.page_images)]
+        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, last_page))
+        return pages
+
+
+class PlainPdf(PlainParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, callback=None, **kwargs):
+        # Extract raw text page by page with PyPDF2; every page is paired with None instead of an image.
+        source = filename if not binary else BytesIO(binary)
+        self.pdf = pdf2_read(source)
+        texts = [pg.extract_text() for pg in self.pdf.pages[from_page: to_page]]
+        callback(0.9, "Parsing finished")
+        return [(text, None) for text in texts]
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, parser_config=None, **kwargs):
+    """
+    Chunk a presentation: ".ppt"/".pptx" and ".pdf" file names are accepted, anything else raises NotImplementedError.
+    Every slide or page becomes one chunk; its image, when the parser yields one, is stored on the chunk.
+    For PDFs, parser_config["layout_recognize"] picks DeepDOC (default), "Plain Text", or a vision model by name.
+    """
+    if parser_config is None:
+        parser_config = {}
+    eng = lang.lower() == "english"
+    doc = {  # fields shared by every chunk; deep-copied per page below
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    res = []
+    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
+        ppt_parser = Ppt()
+        for pn, (txt, img) in enumerate(ppt_parser(
+                filename if not binary else binary, from_page, 1000000, callback)):  # NOTE: 1000000 is passed, not to_page
+            d = copy.deepcopy(doc)
+            pn += from_page  # index within the slice -> zero-based slide index
+            d["image"] = img
+            d["doc_type_kwd"] = "image"
+            d["page_num_int"] = [pn + 1]  # stored page numbers are one-based
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]  # the whole slide image
+            tokenize(d, txt, eng)
+            res.append(d)
+        return res
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if layout_recognizer == "DeepDOC":
+            pdf_parser = Pdf()
+            sections = pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
+        elif layout_recognizer == "Plain Text":
+            pdf_parser = PlainParser()  # NOTE(review): PlainPdf defined above is not used here - confirm which is intended
+            sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
+                                      callback=callback)
+        else:  # any other value is used as the name of an image-to-text model
+            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
+            pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
+            sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
+                                      callback=callback)
+
+        callback(0.8, "Finish parsing.")
+        for pn, (txt, img) in enumerate(sections):
+            d = copy.deepcopy(doc)
+            pn += from_page
+            if img:  # a falsy img leaves the chunk without an "image" key
+                d["image"] = img
+            d["page_num_int"] = [pn + 1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            tokenize(d, txt, eng)
+            res.append(d)
+        return res
+
+    raise NotImplementedError(
+        "file type not supported yet(pptx, pdf supported)")
+
+
+if __name__ == "__main__":  # manual smoke test: chunk the file named by the first CLI argument
+    import sys
+
+    def dummy(prog=None, msg=""):  # must accept callback(msg=...) as well as callback(prog, msg), both used by the parsers
+        pass
+    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -0,0 +1,471 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+import re
+import csv
+from copy import deepcopy
+from io import BytesIO
+from timeit import default_timer as timer
+from openpyxl import load_workbook
+
+from deepdoc.parser.utils import get_text
+from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
+from rag.nlp import rag_tokenizer, tokenize_table, concat_img
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser
+from docx import Document
+from PIL import Image
+from markdown import markdown
+
+from rag.utils import get_float
+
+
+class Excel(ExcelParser):  # workbook parser that reads one question/answer pair per row
+    def __call__(self, fnm, binary=None, callback=None):  # returns [(question, answer), ...]; callback must not be None
+        if not binary:
+            wb = load_workbook(fnm)
+        else:
+            wb = load_workbook(BytesIO(binary))
+        total = 0  # row count over all sheets, used to scale the progress value
+        for sheetname in wb.sheetnames:
+            total += len(list(wb[sheetname].rows))
+
+        res, fails = [], []
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            rows = list(ws.rows)
+            for i, r in enumerate(rows):
+                q, a = "", ""
+                for cell in r:  # first non-empty cell is the question, second is the answer
+                    if not cell.value:
+                        continue
+                    if not q:
+                        q = str(cell.value)
+                    elif not a:
+                        a = str(cell.value)
+                    else:
+                        break
+                if q and a:
+                    res.append((q, a))
+                else:
+                    fails.append(str(i + 1))  # one-based row number within its sheet
+                if len(res) % 999 == 0:  # periodic progress report (also true while res is still empty)
+                    callback(len(res) *
+                             0.6 /
+                             total, ("Extract pairs: {}. ".format(len(res)) +
+                                     (f"{len(fails)} failure, line: %s..." %
+                                      (",".join(fails[:3])) if fails else "")))
+
+        callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        self.is_english = is_english(  # language is guessed from a random sample of up to 30 questions
+            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
+        return res
+
+
+class Pdf(PdfParser):  # Q&A-oriented PDF parser: splits text boxes into question/answer pairs with images
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):  # returns (qai_list, tbls); callback must not be None
+        start = timer()
+        callback(msg="OCR started")
+        self.__images__(
+            filename if not binary else binary,  # the binary payload wins over the file name when both are given
+            zoomin,
+            from_page,
+            to_page,
+            callback
+        )
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
+        start = timer()
+        self._layouts_rec(zoomin, drop=False)
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._table_transformer_job(zoomin)
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
+        self._text_merge()
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        #self._naive_vertical_merge()
+        # self._concat_downward()
+        #self._filter_forpages()
+        logging.debug("layouts: {}".format(timer() - start))
+        sections = [b["text"] for b in self.boxes]
+        bull_x0_list = []
+        q_bull, reg = qbullets_category(sections)  # reg is handed to has_qbullet below to detect question bullets
+        if q_bull == -1:
+            raise ValueError("Unable to recognize Q&A structure.")
+        qai_list = []  # collected (question, answer, image, positions) tuples
+        last_q, last_a, last_tag = '', '', ''  # the question/answer being accumulated and its position tags
+        last_index = -1
+        last_box = {'text':''}
+        last_bull = None
+        def sort_key(element):  # order tables/figures by (page number, top coordinate)
+            tbls_pn = element[1][0][0]
+            tbls_top = element[1][0][3]
+            return tbls_pn, tbls_top
+        tbls.sort(key=sort_key)
+        tbl_index = 0  # next table/figure not yet attached to an answer
+        last_pn, last_bottom = 0, 0  # page number and bottom coordinate of the previously processed line
+        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
+        for box in self.boxes:
+            section, line_tag = box['text'], self._line_tag(box, zoomin)
+            has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
+            last_box, last_index, last_bull = box, index, has_bull
+            line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])  # first tab-separated field of the tag: page number
+            line_top = get_float(line_tag.rstrip('##').split('\t')[3])  # fourth field: top coordinate
+            tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
+            if not has_bull:  # No question bullet
+                if not last_q:  # text before the first question is dropped
+                    if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top):    # image passed
+                        tbl_index += 1
+                    continue
+                else:
+                    sum_tag = line_tag
+                    sum_section = section
+                    while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
+                        and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)):    # add image at the middle of current answer
+                        sum_tag = f'{tbl_tag}{sum_tag}'
+                        sum_section = f'{tbl_text}{sum_section}'
+                        tbl_index += 1
+                        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
+                    last_a = f'{last_a}{sum_section}'
+                    last_tag = f'{last_tag}{sum_tag}'
+            else:
+                if last_q:  # a new question starts: flush the previous pair first
+                    while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
+                        and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)):    # add image at the end of last answer
+                        last_tag = f'{last_tag}{tbl_tag}'
+                        last_a = f'{last_a}{tbl_text}'
+                        tbl_index += 1
+                        tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
+                    image, poss = self.crop(last_tag, need_position=True)
+                    qai_list.append((last_q, last_a, image, poss))
+                    last_q, last_a, last_tag = '', '', ''
+                last_q = has_bull.group()  # the matched bullet text becomes the question
+                _, end = has_bull.span()
+                last_a = section[end:]  # whatever follows the bullet on this line starts the answer
+                last_tag = line_tag
+            last_bottom = float(line_tag.rstrip('##').split('\t')[4])  # fifth field of the tag: bottom coordinate
+            last_pn = line_pn
+        if last_q:  # flush the final pair
+            qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
+        return qai_list, tbls
+
+    def get_tbls_info(self, tbls, tbl_index):  # unpack page, box, position tag and text of tbls[tbl_index]
+        if tbl_index >= len(tbls):  # past the last table: return a placeholder with an all-zero tag and empty text
+            return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
+        tbl_pn = tbls[tbl_index][1][0][0]+1  # stored page index + 1, compared against page numbers parsed from line tags
+        tbl_left = tbls[tbl_index][1][0][1]
+        tbl_right = tbls[tbl_index][1][0][2]
+        tbl_top = tbls[tbl_index][1][0][3]
+        tbl_bottom = tbls[tbl_index][1][0][4]
+        tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+            .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
+        _tbl_text = ''.join(tbls[tbl_index][0][1])
+        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, _tbl_text
+
+
+class Docx(DocxParser):  # Q&A-oriented DOCX parser: heading-like paragraphs are questions, the rest are answers
+    def __init__(self):  # intentionally empty body; the base initializer is not called
+        pass
+
+    def get_picture(self, document, paragraph):  # first embedded picture of the paragraph as an RGB PIL image, or None
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]  # relationship id of the embedded image part
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):  # returns (qai_list, tbls)
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0  # current page index, advanced by the page-break markers found in the runs
+        last_answer, last_image = "", None
+        question_stack, level_stack = [], []  # currently open questions and their levels, outermost first
+        qai_list = []  # collected (joined questions, answer text, image) tuples
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            question_level, p_text = 0, ''
+            if from_page <= pn < to_page and p.text.strip():  # paragraphs outside the page range keep level 0 and empty text
+                question_level, p_text = docx_question_level(p)
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{p_text}'
+                current_image = self.get_picture(self.doc, p)
+                last_image = concat_img(last_image, current_image)
+            else:   # is a question
+                if last_answer or last_image:  # flush the answer gathered for the previous question path
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        qai_list.append((sum_question, last_answer, last_image))
+                    last_answer, last_image = '', None
+
+                i = question_level
+                while question_stack and i <= level_stack[-1]:  # close questions at the same or a deeper level
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(p_text)
+                level_stack.append(question_level)
+            for run in p.runs:  # count page breaks inside this paragraph
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        if last_answer:  # flush the final pair; unlike the in-loop flush this does not test last_image
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                qai_list.append((sum_question, last_answer, last_image))
+
+        tbls = []
+        for tb in self.doc.tables:  # render every table as an HTML string
+            html= "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    for j in range(i+1, len(r.cells)):  # later cells with the same text are folded into this cell's colspan
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j  # resume after the last cell that matched
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))  # shape is ((None, html), "")
+        return qai_list, tbls
+
+
+def rmPrefix(txt):
+    """Remove one leading Q/A role label from *txt*.
+
+    The text is trimmed first. A label (e.g. ``Question``, ``A``, ``问题``) is
+    matched case-insensitively at the start and is only removed when followed
+    by at least one tab, space, or ASCII/full-width colon.
+    """
+    label_pattern = r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:： ]+"
+    trimmed = txt.strip()
+    return re.sub(label_pattern, "", trimmed, flags=re.IGNORECASE)
+
+
+def beAdocPdf(d, q, a, eng, image, poss):
+    """Populate chunk dict *d* from a PDF question/answer pair and return it.
+
+    ``content_with_weight`` holds the labelled question and answer joined by a
+    tab (English labels when *eng* is true, Chinese otherwise). Only the raw
+    question *q* is tokenized. A truthy *image* is attached and marks the chunk
+    as an image document; *poss* is handed to ``add_positions``.
+    """
+    if eng:
+        q_label, a_label = "Question: ", "Answer: "
+    else:
+        q_label, a_label = "问题：", "回答："
+    question_part = q_label + rmPrefix(q)
+    answer_part = a_label + rmPrefix(a)
+    d["content_with_weight"] = question_part + "\t" + answer_part
+    coarse_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = coarse_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse_tokens)
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
+    add_positions(d, poss)
+    return d
+
+
+def beAdocDocx(d, q, a, eng, image, row_num=-1):
+    """Populate chunk dict *d* from a DOCX question/answer pair and return it.
+
+    Writes the labelled, tab-joined Q&A text, the tokens of the raw question,
+    the optional image (which also sets ``doc_type_kwd`` to ``"image"``), and
+    ``top_int`` when *row_num* is non-negative.
+    """
+    labels = ("Question: ", "Answer: ") if eng else ("问题：", "回答：")
+    pieces = [labels[0] + rmPrefix(q), labels[1] + rmPrefix(a)]
+    d["content_with_weight"] = "\t".join(pieces)
+    tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(tokens)
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
+    if not row_num < 0:
+        d["top_int"] = [row_num]
+    return d
+
+
+def beAdoc(d, q, a, eng, row_num=-1):
+    """Populate chunk dict *d* from a plain question/answer pair and return it.
+
+    The stored text is ``<question label><q>\\t<answer label><a>`` with any
+    existing role label stripped from *q* and *a*; labels are English when
+    *eng* is true and Chinese otherwise. Tokens come from the raw question.
+    ``top_int`` is set only for a non-negative *row_num*.
+    """
+    question_label = "Question: " if eng else "问题："
+    answer_label = "Answer: " if eng else "回答："
+    labelled_question = question_label + rmPrefix(q)
+    labelled_answer = answer_label + rmPrefix(a)
+    d["content_with_weight"] = f"{labelled_question}\t{labelled_answer}"
+    question_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = question_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(question_tokens)
+    if row_num >= 0:
+        d["top_int"] = [row_num]
+    return d
+
+
+def mdQuestionLevel(s):
+    """Split a markdown line into ``(heading_level, text)``.
+
+    The level is the number of ``#`` characters at the very start of *s* (0 if
+    there are none). The text is *s* without those characters and without the
+    whitespace that follows them.
+    """
+    without_hashes = s.lstrip('#')
+    level = len(s) - len(without_hashes)
+    return level, without_hashes.lstrip()
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+    """
+        Split a question/answer document into chunks, one chunk per Q&A pair.
+
+        Excel, csv(txt), pdf, markdown and docx format files are supported.
+        If the file is in excel format, there should be 2 column question and answer without header.
+        And question column is ahead of answer column.
+        And it's O.K if it has multiple sheets as long as the columns are rightly composed.
+
+        If it's in csv or txt format, it should be UTF-8 encoded. Use TAB as delimiter to separate
+        question and answer. For txt, comma is used instead when more lines split into exactly two
+        fields by comma than by TAB; for csv, comma is used when no line contains a TAB.
+        A line that does not split into exactly two fields is appended to the current answer, or
+        recorded as a failure when no question has been seen yet.
+
+        For markdown, headings of level 1-6 outside fenced code blocks are questions (nested
+        headings are joined with newlines) and the text below them is the answer.
+
+        Every pair of Q&A will be treated as a chunk.
+
+        Args:
+            filename: File name; its extension selects the parser.
+            binary: Optional file content; forwarded to the parsers.
+            from_page, to_page: Forwarded to the PDF parser only.
+            lang: "english" (case-insensitive) selects English Q/A labels, anything else Chinese.
+            callback: Progress reporter called as callback(progress, message). It is invoked
+                unconditionally, so a callable must be supplied in practice.
+
+        Returns:
+            A list of chunk dicts.
+
+        Raises:
+            NotImplementedError: if the file extension is not supported.
+    """
+    eng = lang.lower() == "english"
+    res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
+            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
+        return res
+
+    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        # Vote for the delimiter: count the lines that split into exactly two fields.
+        comma, tab = 0, 0
+        for line in lines:
+            if len(line.split(",")) == 2:
+                comma += 1
+            if len(line.split("\t")) == 2:
+                tab += 1
+        delimiter = "\t" if tab >= comma else ","
+
+        fails = []
+        question, answer = "", ""
+        for i, line in enumerate(lines):
+            arr = line.split(delimiter)
+            if len(arr) != 2:
+                if question:
+                    # Continuation of a multi-line answer.
+                    answer += "\n" + line
+                else:
+                    fails.append(str(i + 1))
+            else:
+                # A new pair starts: flush the previous one first.
+                if question and answer:
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
+                question, answer = arr
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
+                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        if question:
+            res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        return res
+
+    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        delimiter = "\t" if any("\t" in line for line in lines) else ","
+
+        fails = []
+        question, answer = "", ""
+        reader = csv.reader(lines, delimiter=delimiter)
+
+        # Count rows while iterating: the reader is a one-shot iterator, so it
+        # cannot be measured after the loop (len(list(reader)) would be 0).
+        rows_read = 0
+        for i, row in enumerate(reader):
+            rows_read = i + 1
+            if len(row) != 2:
+                if question:
+                    answer += "\n" + lines[i]
+                else:
+                    fails.append(str(i + 1))
+            else:
+                if question and answer:
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
+                question, answer = row
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
+                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        if question:
+            res.append(beAdoc(deepcopy(doc), question, answer, eng, rows_read))
+
+        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
+            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        return res
+
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        pdf_parser = Pdf()
+        # The tables returned by the PDF parser are not used for Q&A chunks.
+        qai_list, _ = pdf_parser(filename if not binary else binary,
+                                 from_page=from_page, to_page=to_page, callback=callback)
+        for q, a, image, poss in qai_list:
+            res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
+        return res
+
+    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        last_answer = ""
+        question_stack, level_stack = [], []
+        code_block = False
+        for index, line in enumerate(lines):
+            if line.strip().startswith('```'):
+                code_block = not code_block
+            question_level, question = 0, ''
+            if not code_block:
+                question_level, question = mdQuestionLevel(line)
+
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{line}'
+            else:   # is a question
+                if last_answer.strip():
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
+                    last_answer = ''
+
+                # Drop headings that are not ancestors of the new one.
+                while question_stack and question_level <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(question)
+                level_stack.append(question_level)
+        if last_answer.strip():
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
+        return res
+
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
+        docx_parser = Docx()
+        # NOTE(review): the page window is fixed here; from_page/to_page are not forwarded.
+        qai_list, tbls = docx_parser(filename, binary,
+                                    from_page=0, to_page=10000, callback=callback)
+        res = tokenize_table(tbls, doc, eng)
+        for i, (q, a, image) in enumerate(qai_list):
+            res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
+        return res
+
+    raise NotImplementedError(
+        "Excel, csv(txt), pdf, markdown and docx format files are supported.")
+
+
+if __name__ == "__main__":
+    # Manual run: chunk the file named on the command line with from_page=0 and
+    # to_page=10, discarding all progress reports.
+    import sys
+
+    def _ignore_progress(prog=None, msg=""):
+        """Progress callback that drops every report."""
+        return None
+
+    chunk(sys.argv[1], callback=_ignore_progress, from_page=0, to_page=10)
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@@ -0,0 +1,176 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import logging
+import base64
+import datetime
+import json
+import re
+import pandas as pd
+import requests
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from rag.nlp import rag_tokenizer
+from deepdoc.parser.resume import refactor
+from deepdoc.parser.resume import step_one, step_two
+from rag.utils import rmSpace
+
+# Fields that keep all of their values: chunk() reduces a list-valued resume field
+# to its first element unless the field is named here and holds more than one value.
+forbidden_select_fields4resume = [
+    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
+]
+
+
+def remote_call(filename, binary):
+    """Send a resume file to the local parsing service and post-process the reply.
+
+    The file content is base64-encoded and posted as JSON to
+    ``http://127.0.0.1:61670/tog``. The ``response.results`` part of the reply
+    is run through ``refactor``, empty optional sections are removed, and the
+    result goes through ``step_one.refactor`` and ``step_two.parse``.
+
+    Up to three attempts are made; any exception is logged and triggers the
+    next attempt.
+
+    Returns:
+        The parsed resume, or an empty dict when all attempts fail.
+    """
+    encoded_file = base64.b64encode(binary).decode('utf-8')
+    header = {
+        "uid": 1,
+        "user": "kevinhu",
+        "log_id": filename
+    }
+    request = {
+        "p": {
+            "request_id": "1",
+            "encrypt_type": "base64",
+            "filename": filename,
+            "langtype": '',
+            "fileori": encoded_file
+        },
+        "c": "resume_parse_module",
+        "m": "resume_parse"
+    }
+    q = {"header": header, "request": request}
+    optional_sections = ("education", "work", "project",
+                         "training", "skill", "certificate", "language")
+
+    attempts_left = 3
+    while attempts_left > 0:
+        attempts_left -= 1
+        try:
+            reply = requests.post(
+                "http://127.0.0.1:61670/tog",
+                data=json.dumps(q))
+            resume = refactor(reply.json()["response"]["results"])
+            # Remove optional sections that are present but empty.
+            for section in optional_sections:
+                if section in resume and not resume.get(section):
+                    del resume[section]
+
+            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            frame = pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
+                                   "updated_at": now}])
+            return step_two.parse(step_one.refactor(frame))
+        except Exception:
+            logging.exception("Resume parser has not been supported yet!")
+    return {}
+
+
+def chunk(filename, binary=None, callback=None, **kwargs):
+    """
+    Parse a resume file into a single chunk.
+
+    The supported file formats are pdf, doc, docx and txt.
+    To maximize the effectiveness, parse the resume correctly, please contact us: https://github.com/infiniflow/ragflow
+
+    Args:
+        filename: File name; also read from disk when *binary* is empty.
+        binary: Optional raw file content.
+        callback: Progress reporter called as callback(progress, message). It is
+            invoked unconditionally, so a callable must be supplied in practice.
+        **kwargs: Must contain ``kb_id``, the knowledge base whose parser config
+            receives the field map.
+
+    Returns:
+        A one-element list holding the chunk dict.
+
+    Raises:
+        NotImplementedError: if the file extension is not supported.
+        Exception: if the remote parser returned fewer than 7 fields.
+    """
+    if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
+        raise NotImplementedError("file type not supported yet(pdf supported)")
+
+    if not binary:
+        with open(filename, "rb") as f:
+            binary = f.read()
+
+    callback(0.2, "Resume parsing is going on...")
+    resume = remote_call(filename, binary)
+    if len(resume) < 7:
+        callback(-1, "Resume is not successfully parsed.")
+        raise Exception("Resume parser remote call fail!")
+    callback(0.6, "Done parsing. Chunking...")
+    logging.debug("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
+
+    # Field name -> human-readable description. Also stored as the knowledge
+    # base's field map at the end of this function.
+    field_map = {
+        "name_kwd": "姓名/名字",
+        "name_pinyin_kwd": "姓名拼音/名字拼音",
+        "gender_kwd": "性别（男，女）",
+        "age_int": "年龄/岁/年纪",
+        "phone_kwd": "电话/手机/微信",
+        "email_tks": "email/e-mail/邮箱",
+        "position_name_tks": "职位/职能/岗位/职责",
+        "expect_city_names_tks": "期望城市",
+        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
+        "corporation_name_tks": "最近就职(上班)的公司/上一家公司",
+
+        "first_school_name_tks": "第一学历毕业学校",
+        "first_degree_kwd": "第一学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "highest_degree_kwd": "最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "first_major_tks": "第一学历专业",
+        "edu_first_fea_kwd": "第一学历标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",
+
+        "degree_kwd": "过往学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "major_tks": "学过的专业/过往专业",
+        "school_name_tks": "学校/毕业院校",
+        "sch_rank_kwd": "学校标签（顶尖学校，精英学校，优质学校，一般学校）",
+        "edu_fea_kwd": "教育标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",
+
+        "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
+        "edu_end_int": "毕业年份",
+        "industry_name_tks": "所在行业",
+
+        "birth_dt": "生日/出生年份",
+        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
+    }
+
+    # Title: name-gender-position-age followed by the "简历" suffix.
+    titles = []
+    for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
+        v = resume.get(n, "")
+        if isinstance(v, list):
+            # An empty list has no first element; treat it as a missing value.
+            v = v[0] if v else ""
+        if n.find("tks") > 0:
+            v = rmSpace(v)
+        titles.append(str(v))
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+
+    # Readable "description: value" lines for every non-empty field.
+    pairs = []
+    for n, m in field_map.items():
+        if not resume.get(n):
+            continue
+        v = resume[n]
+        if isinstance(v, list):
+            # str() each item so non-string entries do not break the join.
+            v = " ".join(str(item) for item in v)
+        if n.find("tks") > 0:
+            v = rmSpace(v)
+        pairs.append((m, str(v)))
+
+    doc["content_with_weight"] = "\n".join(
+        ["{}: {}".format(re.sub(r"（[^（）]+）", "", k), v) for k, v in pairs])
+    doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
+    doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
+
+    # Copy the structured fields onto the chunk.
+    for n in field_map:
+        if n not in resume:
+            continue
+        value = resume[n]
+        if isinstance(value, list):
+            if not value:
+                # Nothing to store, and value[0] would raise IndexError.
+                continue
+            if len(value) == 1 or n not in forbidden_select_fields4resume:
+                value = value[0]
+        if n.find("_tks") > 0:
+            value = rag_tokenizer.fine_grained_tokenize(value)
+        doc[n] = value
+
+    logging.debug("chunked resume to " + str(doc))
+    KnowledgebaseService.update_parser_config(
+        kwargs["kb_id"], {"field_map": field_map})
+    return [doc]
+
+
+if __name__ == "__main__":
+    # Manual run: chunk the resume named on the command line and discard all
+    # progress reports.
+    import sys
+
+    def _ignore_progress(a, b):
+        """Progress callback that drops every report."""
+        return None
+
+    chunk(sys.argv[1], callback=_ignore_progress)
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -0,0 +1,402 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import copy
+import re
+from io import BytesIO
+from xpinyin import Pinyin
+import numpy as np
+import pandas as pd
+from collections import Counter
+
+# from openpyxl import load_workbook, Workbook
+from dateutil.parser import parse as datetime_parse
+
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from deepdoc.parser.utils import get_text
+from rag.nlp import rag_tokenizer, tokenize
+from deepdoc.parser import ExcelParser
+
+
+class Excel(ExcelParser):
+    """Excel parser that turns each worksheet into a pandas DataFrame.
+
+    The header may span several rows when merged cells are involved; merged
+    header cells are flattened into ``parent-child`` column names.
+    """
+
+    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
+        """Parse a workbook into a list of DataFrames, one per non-empty sheet.
+
+        Args:
+            fnm: Path of the workbook; used when *binary* is empty.
+            binary: Optional raw workbook bytes.
+            from_page, to_page: Window over data rows (header rows excluded),
+                counted across all sheets; rows with index in
+                ``[from_page, to_page)`` are kept.
+            callback: Progress reporter called as callback(progress, message).
+                It is invoked unconditionally, so a callable must be supplied.
+
+        Returns:
+            A list of DataFrames whose columns are the detected headers.
+        """
+        if not binary:
+            wb = Excel._load_excel_to_workbook(fnm)
+        else:
+            wb = Excel._load_excel_to_workbook(BytesIO(binary))
+        res, fails = [], []
+        rn = 0  # data rows seen so far, across all sheets
+        for sheetname in wb.sheetnames:
+            ws = wb[sheetname]
+            rows = list(ws.rows)
+            if not rows:
+                continue
+            headers, header_rows = self._parse_headers(ws, rows)
+            if not headers:
+                continue
+            data = []
+            for i, r in enumerate(rows[header_rows:]):
+                rn += 1
+                if rn - 1 < from_page:
+                    continue
+                if rn - 1 >= to_page:
+                    break
+                row_data = self._extract_row_data(ws, r, header_rows + i, len(headers))
+                if row_data is None:
+                    fails.append(str(i))
+                    continue
+                if self._is_empty_row(row_data):
+                    continue
+                data.append(row_data)
+            if len(data) == 0:
+                continue
+            df = pd.DataFrame(data, columns=headers)
+            res.append(df)
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        return res
+
+    def _parse_headers(self, ws, rows):
+        """Return ``(headers, header_row_count)`` for a sheet.
+
+        Uses multi-level parsing when merged cells touch the top rows and
+        single-row parsing otherwise.
+        """
+        if len(rows) == 0:
+            return [], 0
+        if self._has_complex_header_structure(ws, rows):
+            return self._parse_multi_level_headers(ws, rows)
+        return self._parse_simple_headers(rows)
+
+    def _has_complex_header_structure(self, ws, rows):
+        """Return True when any merged range starts in sheet row 1 or 2."""
+        if len(rows) < 1:
+            return False
+        # A merged range that touches the first or second row is taken as a
+        # sign of a multi-level header.
+        return any(rng.min_row <= 2 for rng in ws.merged_cells.ranges)
+
+    def _row_looks_like_header(self, row):
+        """Return True when header-like cells are at least as many as data-like cells.
+
+        A row without any non-empty cell is never a header.
+        """
+        header_like_cells = 0
+        data_like_cells = 0
+        non_empty_cells = 0
+        for cell in row:
+            if cell.value is not None:
+                non_empty_cells += 1
+                val = str(cell.value).strip()
+                if self._looks_like_header(val):
+                    header_like_cells += 1
+                elif self._looks_like_data(val):
+                    data_like_cells += 1
+        if non_empty_cells == 0:
+            return False
+        return header_like_cells >= data_like_cells
+
+    def _parse_simple_headers(self, rows):
+        """Read the headers from the first row.
+
+        Empty or blank cells get the placeholder ``Column_<position>`` so the
+        header count always equals the number of cells in the row.
+        """
+        if not rows:
+            return [], 0
+        final_headers = []
+        for i, cell in enumerate(rows[0]):
+            header_value = "" if cell.value is None else str(cell.value).strip()
+            final_headers.append(header_value or f"Column_{i + 1}")
+        return final_headers, 1
+
+    def _parse_multi_level_headers(self, ws, rows):
+        """Return ``(headers, header_row_count)`` for a header spanning several rows."""
+        if len(rows) < 2:
+            return [], 0
+        header_rows = self._detect_header_rows(rows)
+        if header_rows == 1:
+            return self._parse_simple_headers(rows)
+        return self._build_hierarchical_headers(ws, rows, header_rows), header_rows
+
+    def _detect_header_rows(self, rows):
+        """Count the leading rows that look like headers.
+
+        The first row always counts; at most the first five rows are examined
+        and counting stops at the first row that does not look like a header.
+        """
+        if len(rows) < 2:
+            return 1
+        header_rows = 1
+        max_check_rows = min(5, len(rows))
+        for i in range(1, max_check_rows):
+            if self._row_looks_like_header(rows[i]):
+                header_rows = i + 1
+            else:
+                break
+        return header_rows
+
+    def _looks_like_header(self, value):
+        """Heuristic: non-ASCII text, two or more letters, or label punctuation."""
+        if len(value) < 1:
+            return False
+        if any(ord(c) > 127 for c in value):
+            return True
+        if len([c for c in value if c.isalpha()]) >= 2:
+            return True
+        if any(c in value for c in ["(", ")", "：", ":", "（", "）", "_", "-"]):
+            return True
+        return False
+
+    def _looks_like_data(self, value):
+        """Heuristic: single-character flag, plain number, or short ``0x`` literal."""
+        if len(value) == 1 and value.upper() in ["Y", "N", "M", "X", "/", "-"]:
+            return True
+        if value.replace(".", "").replace("-", "").replace(",", "").isdigit():
+            return True
+        if value.startswith("0x") and len(value) <= 10:
+            return True
+        return False
+
+    def _build_hierarchical_headers(self, ws, rows, header_rows):
+        """Build one header per column by joining its header-row parts with ``-``.
+
+        For each column the cell values of the first *header_rows* rows are
+        collected (a merged cell contributes its anchor value); duplicates and
+        parts rejected by ``_is_valid_header_part`` are dropped. Columns with
+        no usable part are named ``Column_<position>``.
+        """
+        headers = []
+        max_col = max(len(row) for row in rows[:header_rows]) if header_rows > 0 else 0
+        merged_ranges = list(ws.merged_cells.ranges)
+        for col_idx in range(max_col):
+            header_parts = []
+            for row_idx in range(header_rows):
+                if col_idx < len(rows[row_idx]):
+                    cell_value = rows[row_idx][col_idx].value
+                    merged_value = self._get_merged_cell_value(ws, row_idx + 1, col_idx + 1, merged_ranges)
+                    if merged_value is not None:
+                        cell_value = merged_value
+                    if cell_value is not None:
+                        cell_value = str(cell_value).strip()
+                        if cell_value and cell_value not in header_parts and self._is_valid_header_part(cell_value):
+                            header_parts.append(cell_value)
+            if header_parts:
+                headers.append("-".join(header_parts))
+            else:
+                headers.append(f"Column_{col_idx + 1}")
+        final_headers = [h for h in headers if h and h != "-"]
+        return final_headers
+
+    def _is_valid_header_part(self, value):
+        """Reject flags, plain numbers and lone operator characters as header parts."""
+        if len(value) == 1 and value.upper() in ["Y", "N", "M", "X"]:
+            return False
+        if value.replace(".", "").replace("-", "").replace(",", "").isdigit():
+            return False
+        if value in ["/", "-", "+", "*", "="]:
+            return False
+        return True
+
+    def _get_merged_cell_value(self, ws, row, col, merged_ranges):
+        """Return the anchor (top-left) value of the merged range covering (row, col).
+
+        *row* and *col* are 1-based. Returns None when no range covers the cell.
+        """
+        for merged_range in merged_ranges:
+            if merged_range.min_row <= row <= merged_range.max_row and merged_range.min_col <= col <= merged_range.max_col:
+                return ws.cell(merged_range.min_row, merged_range.min_col).value
+        return None
+
+    def _extract_row_data(self, ws, row, absolute_row_idx, expected_cols):
+        """Read *expected_cols* values of one data row, filling in merged cells.
+
+        *absolute_row_idx* is the 0-based position of the row in the sheet's
+        row list; the sheet itself is addressed 1-based.
+        """
+        row_data = []
+        merged_ranges = list(ws.merged_cells.ranges)
+        actual_row_num = absolute_row_idx + 1
+        for col_idx in range(expected_cols):
+            cell_value = None
+            actual_col_num = col_idx + 1
+            try:
+                cell_value = ws.cell(row=actual_row_num, column=actual_col_num).value
+            except ValueError:
+                # Fall back to the already-materialized row cells.
+                if col_idx < len(row):
+                    cell_value = row[col_idx].value
+            if cell_value is None:
+                merged_value = self._get_merged_cell_value(ws, actual_row_num, actual_col_num, merged_ranges)
+                if merged_value is not None:
+                    cell_value = merged_value
+                else:
+                    cell_value = self._get_inherited_value(ws, actual_row_num, actual_col_num, merged_ranges)
+            row_data.append(cell_value)
+        return row_data
+
+    def _get_inherited_value(self, ws, row, col, merged_ranges):
+        """Return the value a cell inherits from the merged range covering it.
+
+        This is the same lookup as ``_get_merged_cell_value``; it is kept as a
+        separate method name and simply delegates.
+        """
+        return self._get_merged_cell_value(ws, row, col, merged_ranges)
+
+    def _is_empty_row(self, row_data):
+        """Return True when every value is None or blank after ``str().strip()``."""
+        return all(val is None or str(val).strip() == "" for val in row_data)
+
+
+def trans_datatime(s):
+    """Normalize a date/time string to ``YYYY-MM-DD HH:MM:SS``.
+
+    Returns None when *s* cannot be stripped, parsed or formatted.
+    """
+    try:
+        parsed = datetime_parse(s.strip())
+        return parsed.strftime("%Y-%m-%d %H:%M:%S")
+    except Exception:
+        return None
+
+
+def trans_bool(s):
+    """Map a truthy/falsy marker to ``"yes"``/``"no"``.
+
+    The value is converted with ``str()`` and trimmed; matching is
+    case-insensitive and must cover the whole token. Returns None for
+    anything that is not a recognized marker.
+    """
+    token = str(s).strip()
+    for pattern, label in (
+        (r"(true|yes|是|\*|✓|✔|☑|✅|√)$", "yes"),
+        (r"(false|no|否|⍻|×)$", "no"),
+    ):
+        if re.match(pattern, token, flags=re.IGNORECASE):
+            return label
+    return None
+
+
+def column_data_type(arr):
+    """Infer the dominant type of a column and coerce its values to that type.
+
+    Every non-empty value votes for one of ``int``, ``float``, ``bool``,
+    ``datetime`` or ``text``; the type with the most votes wins (ties go to
+    the earliest of int, float, text, datetime, bool). None and blank values
+    do not vote. Values starting with "0" never vote int or float. If an
+    integer does not fit into a signed 64-bit range the column is forced to
+    ``float``.
+
+    Args:
+        arr: Iterable of cell values.
+
+    Returns:
+        ``(values, type_name)`` where *values* is a list in which each
+        non-None entry was converted to the chosen type, or replaced by None
+        when the conversion failed.
+    """
+    arr = list(arr)
+    counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
+    trans = {"int": int, "float": float, "datetime": trans_datatime, "bool": trans_bool, "text": str}
+    int64_min, int64_max = -(2**63), 2**63 - 1
+    float_flag = False
+    for a in arr:
+        if a is None:
+            continue
+        raw = str(a)
+        if not raw.strip():
+            # Blank cells carry no type information. Without this guard they
+            # match the float pattern below and can outvote the real values.
+            continue
+        cleaned = raw.replace("%%", "")
+        leading_zero = cleaned.startswith("0")
+        if re.match(r"[+-]?[0-9]+$", cleaned) and not leading_zero:
+            counts["int"] += 1
+            # Convert the same string the pattern was tested on, and check
+            # both ends of the signed 64-bit range.
+            if not int64_min <= int(cleaned) <= int64_max:
+                float_flag = True
+                break
+        elif (re.match(r"[+-]?[0-9.]{,19}$", cleaned) and not leading_zero
+                and any(ch.isdigit() for ch in cleaned)):
+            # The digit check keeps placeholders such as "-" or "." from
+            # voting float.
+            counts["float"] += 1
+        elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", raw, flags=re.IGNORECASE):
+            counts["bool"] += 1
+        elif trans_datatime(raw):
+            counts["datetime"] += 1
+        else:
+            counts["text"] += 1
+    if float_flag:
+        ty = "float"
+    else:
+        # max() returns the first maximal key in insertion order.
+        ty = max(counts, key=counts.get)
+    convert = trans[ty]
+    for i, value in enumerate(arr):
+        if value is None:
+            continue
+        try:
+            arr[i] = convert(str(value))
+        except Exception:
+            # A value that does not fit the column type is dropped.
+            arr[i] = None
+    return arr, ty
+
+
+def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
+    """
+    Turn every row of a table file into one chunk.
+
+    Excel and csv(txt) format files are supported.
+    For csv or txt file, the delimiter between columns is TAB unless a
+    ``delimiter`` keyword argument is given.
+    The first line must be column headers.
+    Column headers must be meaningful terms inorder to make our NLP model understanding.
+    It's good to enumerate some synonyms using slash '/' to separate, and even better to
+    enumerate values using brackets like 'gender/sex(male, female)'.
+    Here are some examples for headers:
+        1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
+        2. 姓名/名字\t电话/手机/微信\t最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）
+
+    Every row in table will be treated as a chunk.
+
+    Args:
+        filename: File name; its extension selects the parser.
+        binary: Optional file content.
+        from_page, to_page: Window over data rows (header excluded).
+        lang: "english" (case-insensitive) is passed on to ``tokenize`` as the English flag.
+        callback: Progress reporter called as callback(progress, message). It is
+            invoked unconditionally, so a callable must be supplied in practice.
+        **kwargs: Must contain ``kb_id``; may contain ``delimiter``.
+
+    Returns:
+        A list of chunk dicts, one per non-empty row.
+
+    Raises:
+        NotImplementedError: if the file extension is not supported.
+        ValueError: if a sheet has duplicate column names.
+    """
+
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
+    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        fails = []
+        delimiter = kwargs.get("delimiter", "\t")
+        headers = lines[0].split(delimiter)
+        rows = []
+        for i, line in enumerate(lines[1:]):
+            if i < from_page:
+                continue
+            if i >= to_page:
+                break
+            if not line.strip():
+                # Blank lines (e.g. after the final newline) hold no record
+                # and must not be reported as malformed.
+                continue
+            row = line.split(delimiter)
+            if len(row) != len(headers):
+                fails.append(str(i))
+                continue
+            rows.append(row)
+
+        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+
+        # Build the frame from the list directly: an empty list still yields
+        # an empty frame with the right columns, whereas an empty numpy array
+        # has the wrong shape for more than one header.
+        dfs = [pd.DataFrame(rows, columns=headers)]
+
+    else:
+        raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
+
+    res = []
+    PY = Pinyin()
+    # Inferred column type -> suffix of the generated field name.
+    fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
+    for df in dfs:
+        for n in ["id", "_id", "index", "idx"]:
+            if n in df.columns:
+                del df[n]
+        clmns = df.columns.values
+        if len(clmns) != len(set(clmns)):
+            col_counts = Counter(clmns)
+            duplicates = [col for col, count in col_counts.items() if count > 1]
+            if duplicates:
+                raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
+
+        # Field names are built from the header with synonyms ("/...") and
+        # bracketed value lists removed, converted by get_pinyins.
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", str(n)), "_")[0] for n in clmns]
+        clmn_tys = []
+        for j in range(len(clmns)):
+            cln, ty = column_data_type(df[clmns[j]])
+            clmn_tys.append(ty)
+            df[clmns[j]] = cln
+        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
+
+        eng = lang.lower() == "english"  # is_english(txts)
+        for _, row in df.iterrows():
+            d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
+            row_txt = []
+            for j in range(len(clmns)):
+                if row[clmns[j]] is None:
+                    continue
+                if not str(row[clmns[j]]):
+                    continue
+                if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
+                    continue
+                fld = clmns_map[j][0]
+                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
+                row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
+            if not row_txt:
+                continue
+            tokenize(d, "; ".join(row_txt), eng)
+            res.append(d)
+
+        KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
+    callback(0.35, "")
+
+    return res
+
+
+if __name__ == "__main__":
+    # Manual run: chunk the table file named on the command line and discard
+    # all progress reports.
+    import sys
+
+    def _ignore_progress(prog=None, msg=""):
+        """Progress callback that drops every report."""
+        return None
+
+    chunk(sys.argv[1], callback=_ignore_progress)
--- a/rag/app/tag.py
+++ b/rag/app/tag.py
@@ -0,0 +1,157 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import json
+import re
+import csv
+from copy import deepcopy
+
+from deepdoc.parser.utils import get_text
+from rag.app.qa import Excel
+from rag.nlp import rag_tokenizer
+
+
+def beAdoc(d, q, a, eng, row_num=-1):
+    """
+        Fill the chunk dict `d` in place from one content/tags pair and return it.
+
+        `q` is stored unchanged as "content_with_weight"; it is tokenized with
+        rag_tokenizer.tokenize and that result is refined with
+        rag_tokenizer.fine_grained_tokenize.
+        `a` is split on commas into "tag_kwd": each tag is trimmed, blank tags are
+        dropped and every "." in a tag is replaced by "_".
+        `row_num`, when non-negative, is stored as a one-element list under "top_int".
+        `eng` is accepted but not read by this function.
+    """
+    d["content_with_weight"] = q
+    coarse_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = coarse_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse_tokens)
+
+    tags = []
+    for raw_tag in a.split(","):
+        cleaned = raw_tag.strip()
+        if cleaned:
+            tags.append(cleaned.replace(".", "_"))
+    d["tag_kwd"] = tags
+
+    if row_num >= 0:
+        d["top_int"] = [row_num]
+    return d
+
+
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
+    """
+        Turn a file of (content, tags) pairs into tag chunks.
+
+        Excel and csv(txt) format files are supported; the parser is picked from the extension of `filename`.
+        If the file is in excel format, there should be 2 column content and tags without header.
+        And content column is ahead of tags column.
+        And it's O.K if it has multiple sheets as long as the columns are rightly composed.
+
+        If it's in txt format, every line is split on TAB or comma, whichever gives more
+        two-field lines over the whole file (TAB wins a tie).
+        If it's in csv format, it is read with csv.reader and blank fields are discarded.
+        Text files should be UTF-8 encoded.
+
+        A line (txt) or record (csv) that does not give exactly two fields is not a chunk
+        by itself: its raw text is prepended to the content of the next valid pair, and it
+        is dropped if no valid pair follows.
+        Every pair will be treated as a chunk.
+
+        Args:
+            filename: File name; decides the parser and is stored as "docnm_kwd".
+            binary: File content, handed to the Excel parser / get_text unchanged.
+            lang: "english" (case-insensitive) sets the `eng` flag passed to beAdoc.
+            callback: Progress reporter called as callback(progress, message).
+                May be None, in which case progress is not reported.
+            **kwargs: Not used by this function.
+
+        Returns:
+            list of chunk dicts built by beAdoc, in file order.
+
+        Raises:
+            NotImplementedError: if the extension is not xls/xlsx, txt or csv.
+    """
+    if callback is None:
+        # The signature advertises callback as optional, so fall back to a no-op
+        # instead of failing with TypeError on the first progress report.
+        def callback(prog=None, msg=""):
+            pass
+
+    eng = lang.lower() == "english"
+    res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
+            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
+        return res
+
+    if re.search(r"\.(txt)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        lines = get_text(filename, binary).split("\n")
+
+        # Guess the delimiter: count the lines each candidate splits into exactly two fields.
+        comma = sum(1 for line in lines if len(line.split(",")) == 2)
+        tab = sum(1 for line in lines if len(line.split("\t")) == 2)
+        delimiter = "\t" if tab >= comma else ","
+
+        content = ""
+        for i, line in enumerate(lines):
+            arr = line.split(delimiter)
+            if len(arr) != 2:
+                # Not a pair: keep the raw line so it is prepended to the next pair's content.
+                content += "\n" + line
+                continue
+            content += "\n" + arr[0]
+            res.append(beAdoc(deepcopy(doc), content, arr[1], eng, i))
+            content = ""
+            # Report only when a new chunk pushes the count to a multiple of 999, so that
+            # lines producing no chunk do not re-trigger the same progress message.
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), "Extract TAG: {}".format(len(res)))
+
+        callback(0.6, "Extract TAG: {}".format(len(res)))
+        return res
+
+    if re.search(r"\.(csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        lines = get_text(filename, binary).split("\n")
+
+        content = ""
+        reader = csv.reader(lines)
+        consumed = 0  # number of physical lines already attributed to earlier records
+        for i, row in enumerate(reader):
+            # A quoted field may span several physical lines, so the record index is not a
+            # line index; reader.line_num tells how many lines have been read so far.
+            raw = "\n".join(lines[consumed:reader.line_num])
+            consumed = reader.line_num
+
+            row = [r.strip() for r in row if r.strip()]
+            if len(row) != 2:
+                # Not a pair: keep the raw text so it is prepended to the next pair's content.
+                content += "\n" + raw
+                continue
+            content += "\n" + row[0]
+            res.append(beAdoc(deepcopy(doc), content, row[1], eng, i))
+            content = ""
+            # Same throttling as the txt branch: report once per 999 extracted chunks.
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), "Extract Tags: {}".format(len(res)))
+
+        callback(0.6, "Extract TAG : {}".format(len(res)))
+        return res
+
+    raise NotImplementedError(
+        "Excel, csv(txt) format files are supported.")
+
+
+def label_question(question, kbs):
+    """
+        Look up tags for `question` through the tag knowledge bases referenced by `kbs`.
+
+        Every kb whose parser_config has a non-empty "tag_kb_ids" contributes those ids.
+        The tag set for the collected ids is read from the cache (stored there as JSON) or,
+        on a miss, fetched with settings.retrievaler.all_tags_in_portion and written back.
+
+        Returns None when no ids were collected or when KnowledgebaseService.get_by_ids
+        returns nothing for them; otherwise returns the result of
+        settings.retrievaler.tag_query.
+    """
+    from api.db.services.knowledgebase_service import KnowledgebaseService
+    from graphrag.utils import get_tags_from_cache, set_tags_to_cache
+    from api import settings
+
+    tag_kb_ids = []
+    for kb in kbs:
+        configured_ids = kb.parser_config.get("tag_kb_ids")
+        if configured_ids:
+            tag_kb_ids.extend(configured_ids)
+    if not tag_kb_ids:
+        return None
+
+    # NOTE(review): `kb` is the loop variable left over from the scan above, i.e. the last
+    # entry of `kbs`; its tenant_id and "topn_tags" are what get used below - confirm intent.
+    last_kb = kb
+
+    cached = get_tags_from_cache(tag_kb_ids)
+    if cached:
+        all_tags = json.loads(cached)
+    else:
+        all_tags = settings.retrievaler.all_tags_in_portion(last_kb.tenant_id, tag_kb_ids)
+        set_tags_to_cache(tags=all_tags, kb_ids=tag_kb_ids)
+
+    tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids)
+    if not tag_kbs:
+        return None
+
+    tenant_ids = list(set([tag_kb.tenant_id for tag_kb in tag_kbs]))
+    topn = last_kb.parser_config.get("topn_tags", 3)
+    return settings.retrievaler.tag_query(question, tenant_ids, tag_kb_ids, all_tags, topn)
+
+
+if __name__ == "__main__":
+    # Manual run: hand the first command-line argument to chunk().
+    import sys
+
+    def dummy(prog=None, msg=""):
+        """Progress callback that discards every update."""
+        return None
+
+    # from_page/to_page are not named parameters of chunk(); they end up in **kwargs.
+    extra = {"from_page": 0, "to_page": 10}
+    chunk(sys.argv[1], callback=dummy, **extra)