Change flask to fastapi
deepdoc/parser/docx_parser.py (new file, 139 lines added)
@@ -0,0 +1,139 @@
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import re
from collections import Counter
from io import BytesIO

import pandas as pd
from docx import Document

from rag.nlp import rag_tokenizer


class RAGFlowDocxParser:

    def __extract_table_content(self, tb):
        # python-docx repeats a merged cell's text at every grid position,
        # so merged headers are naturally propagated across columns.
        df = []
        for row in tb.rows:
            df.append([c.text for c in row.cells])
        return self.__compose_table_content(pd.DataFrame(df))

    def __compose_table_content(self, df):

        def blockType(b):
            pattern = [
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                (r"^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
                (r"^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            for p, n in pattern:
                if re.search(p, b):
                    return n
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

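        # Tag legend (inferred from the patterns above): Dt=date, Nu=number,
        # Ca=code/identifier, En=English text, NE=number followed by an
        # entity/unit, Sg=single character, Tx=short text, Lx=long text,
        # Nr=person name (rag_tokenizer POS tag "nr"), Ot=other.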
        if len(df) < 2:
            return []
        max_type = Counter([blockType(str(df.iloc[i, j]))
                            for i in range(1, len(df))
                            for j in range(len(df.iloc[i, :]))])
        max_type = max(max_type.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # the header does not necessarily appear in the first row only
        if max_type == "Nu":
            for r in range(1, len(df)):
                tys = Counter([blockType(str(df.iloc[r, j]))
                               for j in range(len(df.iloc[r, :]))])
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)

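        # For each remaining data row, prefix every cell with the values of
        # the nearest contiguous run of header rows above it ("header: value").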
        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1
            headers = []
            for j in range(len(df.iloc[i, :])):
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)
            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))

        if colnm > 3:
            return lines
        return ["\n".join(lines)]

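    # Illustrative example: for a parsed 2x2 table
    #     [["Year", "Revenue"], ["2023", "1,000"]]
    # row 0 is taken as the header, so the composed output is
    #     ["Year: 2023;Revenue: 1,000"]
    # (tables wider than 3 columns return one such string per row instead).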
    def __call__(self, fnm, from_page=0, to_page=100000000):
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # current page number, advanced at each rendered page break
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            runs_within_single_paragraph = []  # save runs within the range of pages
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    runs_within_single_paragraph.append(run.text)  # append run.text first

                # TODO: wrap the page-break check into a static method
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else ''))  # then concat run.text as part of the paragraph

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
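A minimal usage sketch (illustrative; assumes the deepdoc package is on the
import path and that a local sample.docx exists):

    from deepdoc.parser.docx_parser import RAGFlowDocxParser

    parser = RAGFlowDocxParser()
    # from_page/to_page are best-effort ranges: page boundaries are
    # approximated from Word's lastRenderedPageBreak markers in the run XML.
    secs, tbls = parser("sample.docx", from_page=0, to_page=16)
    for text, style in secs:
        if text:
            print(f"[{style}] {text[:80]}")
    for table_lines in tbls:
        print("TABLE:", table_lines[:2])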