v0.21.1-fastapi

This commit is contained in:
2025-11-04 16:06:36 +08:00
parent 3e58c3d0e9
commit d57b5d76ae
218 changed files with 19617 additions and 72339 deletions

View File

@@ -20,11 +20,14 @@ import re
from io import BytesIO
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import bullets_category, is_english, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from deepdoc.parser import PdfParser, PlainParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
from PIL import Image
class Pdf(PdfParser):
@@ -81,13 +84,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections, tbls = [], []
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
doc_parser = DocxParser()
doc_parser = naive.Docx()
# TODO: the table of contents needs to be removed
sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page)
filename, binary=binary, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls]
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
# tbls = [((None, lns), None) for lns in tbls]
# Drop image-only rows; normalize missing tags to ""
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@@ -96,6 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")

View File

@@ -23,6 +23,7 @@ from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
from docx import Document
from PIL import Image
@@ -252,7 +253,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
@@ -261,6 +262,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)

View File

@@ -16,10 +16,10 @@
import logging
import re
import os
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer
from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
@@ -30,9 +30,11 @@ from tika import parser
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from api.utils.file_utils import extract_embed_file
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@@ -256,6 +258,49 @@ class Docx(DocxParser):
tbls.append(((None, html), ""))
return new_line, tbls
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
"""
This function uses mammoth, licensed under the BSD 2-Clause License.
"""
import base64
import uuid
import mammoth
from markdownify import markdownify
docx_file = BytesIO(binary) if binary else open(filename, "rb")
def _convert_image_to_base64(image):
try:
with image.open() as image_file:
image_bytes = image_file.read()
encoded = base64.b64encode(image_bytes).decode("utf-8")
base64_url = f"data:{image.content_type};base64,{encoded}"
alt_name = "image"
alt_name = f"img_{uuid.uuid4().hex[:8]}"
return {"src": base64_url, "alt": alt_name}
except Exception as e:
logging.warning(f"Failed to convert image to base64: {e}")
return {"src": "", "alt": "image"}
try:
if inline_images:
result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
else:
result = mammoth.convert_to_html(docx_file)
html = result.value
markdown_text = markdownify(html)
return markdown_text
finally:
if not binary:
docx_file.close()
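
A quick usage sketch for the new to_markdown helper; the file name is hypothetical:

# Illustrative call -- converts a .docx to Markdown, inlining images as
# base64 data URLs with alt text like "img_1a2b3c4d".
md = Docx().to_markdown(filename="manual.docx", inline_images=True)
# From bytes already in memory instead of a path:
# md = Docx().to_markdown(binary=docx_bytes)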
class Pdf(PdfParser):
def __init__(self):
@@ -285,7 +330,7 @@ class Pdf(PdfParser):
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
self._text_merge(zoomin=zoomin)
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
if separate_tables_figures:
@@ -297,6 +342,7 @@ class Pdf(PdfParser):
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._concat_downward()
self._final_reading_order_merge()
# self._filter_forpages()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
@@ -391,6 +437,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
Successive text will be sliced into pieces using 'delimiter'.
Next, these pieces are merged into chunks whose token count is no more than 'Max token number'.
"""
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
@@ -404,27 +451,37 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = []
pdf_parser = None
section_images = None
is_root = kwargs.get("is_root", True)
embed_res = []
if is_root:
# Only extract embedded files at the root call
embeds = []
if binary is not None:
embeds = extract_embed_file(binary)
else:
raise Exception("Embedded-file extraction from a file path is not supported.")
# Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds:
try:
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
embed_res.extend(sub_res)
except Exception as e:
if callback:
callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
continue
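
The is_root guard above limits embedded-file extraction to one level of recursion; illustrative call shapes (names hypothetical):

# Root call: extract_embed_file() runs and each embed is chunked too.
#   chunk("report.docx", binary=data, tenant_id=tid, lang="English", callback=cb)
# Internal recursive call per embed -- is_root=False skips re-extraction:
#   chunk("embed.xlsx", binary=embed_bytes, is_root=False, tenant_id=tid, ...)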
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary)
if vision_model:
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
@@ -437,10 +494,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@@ -451,29 +510,28 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if layout_recognizer == "DeepDOC":
pdf_parser = Pdf()
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
try:
pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
boosted_figures = pdf_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables.extend(figures)
else:
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables = vision_figure_parser_pdf_wrapper(tbls=tables, callback=callback, **kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif layout_recognizer == "MinerU":
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
pdf_parser = MinerUParser(mineru_path=mineru_executable)
if not pdf_parser.check_installation():
callback(-1, "MinerU not found.")
return res
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
else:
if layout_recognizer == "Plain Text":
pdf_parser = PlainParser()
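
The MinerU branch above is driven entirely by environment variables; an illustrative setup (paths hypothetical):

# Illustrative environment for the MinerU branch (paths hypothetical).
import os
os.environ["MINERU_EXECUTABLE"] = "/usr/local/bin/mineru"  # CLI probed by check_installation()
os.environ["MINERU_OUTPUT_DIR"] = "/tmp/mineru_out"        # "" falls back to the parser default
os.environ["MINERU_DELETE_OUTPUT"] = "0"                   # keep intermediate output for debugging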
@@ -512,7 +570,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
# Process images for each section
section_images = []
@@ -560,7 +618,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, f"tika.parser got empty content from {filename}.")
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
@@ -577,6 +634,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
@@ -586,11 +644,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
if embed_res:
res.extend(embed_res)
return res

View File

@@ -23,6 +23,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
class Pdf(PdfParser):
@@ -57,13 +58,8 @@ class Pdf(PdfParser):
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
@@ -80,6 +76,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary)
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
@@ -89,8 +86,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, _ = pdf_parser(
sections, tbls = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback)
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):

View File

@@ -18,12 +18,12 @@ import logging
import copy
import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.PAPER.value
@@ -160,6 +160,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls=paper["tables"]
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
paper["tables"] = tbls
else:
raise NotImplementedError("file type not supported yet(pdf supported)")

View File

@@ -23,44 +23,62 @@ from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR
from rag.nlp import tokenize
from rag.nlp import rag_tokenizer, tokenize
from rag.utils import clean_markdown_block
from rag.nlp import rag_tokenizer
ocr = OCR()
# Video file extensions supported by Gemini
VIDEO_EXTS = [".mp4", ".mov", ".avi", ".flv", ".mpeg", ".mpg", ".webm", ".wmv", ".3gp", ".3gpp", ".mkv"]
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
img = Image.open(io.BytesIO(binary)).convert('RGB')
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
"image": img,
"doc_type_kwd": "image"
}
bxs = ocr(np.array(img))
txt = "\n".join([t[0] for _, t in bxs if t[0]])
eng = lang.lower() == "english"
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
try:
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format='JPEG')
img_binary.seek(0)
ans = cv_mdl.describe(img_binary.read())
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try:
doc.update({"doc_type_kwd": "video"})
cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
ans += "\n" + ans
tokenize(doc, ans, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
else:
img = Image.open(io.BytesIO(binary)).convert("RGB")
doc.update(
{
"image": img,
"doc_type_kwd": "image",
}
)
bxs = ocr(np.array(img))
txt = "\n".join([t[0] for _, t in bxs if t[0]])
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
try:
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format="JPEG")
img_binary.seek(0)
ans = cv_mdl.describe(img_binary.read())
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
return []
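
With the new dispatch, a file whose extension appears in VIDEO_EXTS skips OCR and is sent straight to the CV LLM; an illustrative call (tenant id hypothetical):

# Illustrative -- tenant id is hypothetical.
with open("demo.mp4", "rb") as f:
    docs = chunk("demo.mp4", f.read(), tenant_id="tenant-1", lang="English",
                 callback=lambda prog=None, msg="": None)
# docs[0]["doc_type_kwd"] == "video"; the model's description is tokenized into the doc.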
@@ -79,7 +97,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
try:
with io.BytesIO() as img_binary:
img.save(img_binary, format='JPEG')
img.save(img_binary, format="JPEG")
img_binary.seek(0)
ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
txt += "\n" + ans

View File

@@ -133,14 +133,14 @@ def label_question(question, kbs):
if tag_kb_ids:
all_tags = get_tags_from_cache(tag_kb_ids)
if not all_tags:
all_tags = settings.retrievaler.all_tags_in_portion(kb.tenant_id, tag_kb_ids)
all_tags = settings.retriever.all_tags_in_portion(kb.tenant_id, tag_kb_ids)
set_tags_to_cache(tags=all_tags, kb_ids=tag_kb_ids)
else:
all_tags = json.loads(all_tags)
tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids)
if not tag_kbs:
return tags
tags = settings.retrievaler.tag_query(question,
tags = settings.retriever.tag_query(question,
list(set([kb.tenant_id for kb in tag_kbs])),
tag_kb_ids,
all_tags,