将flask改成fastapi
This commit is contained in:
40
deepdoc/parser/__init__.py
Normal file
40
deepdoc/parser/__init__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
||||
from .json_parser import RAGFlowJsonParser as JsonParser
|
||||
from .markdown_parser import MarkdownElementExtractor
|
||||
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
|
||||
from .pdf_parser import PlainParser
|
||||
from .pdf_parser import RAGFlowPdfParser as PdfParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
from .txt_parser import RAGFlowTxtParser as TxtParser
|
||||
|
||||
# Public API of the deepdoc.parser package; names re-exported from the
# per-format parser modules imported above.
__all__ = [
    "PdfParser",
    "PlainParser",
    "DocxParser",
    "ExcelParser",
    "PptParser",
    "HtmlParser",
    "JsonParser",
    "MarkdownParser",
    "TxtParser",
    "MarkdownElementExtractor",
]
||||
139
deepdoc/parser/docx_parser.py
Normal file
139
deepdoc/parser/docx_parser.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from docx import Document
|
||||
import re
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from rag.nlp import rag_tokenizer
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class RAGFlowDocxParser:
    """Parse a .docx document into plain-text sections and flattened table rows.

    Calling an instance returns ``(secs, tbls)`` where ``secs`` is a list of
    ``(paragraph_text, style_name)`` tuples and ``tbls`` is a list of
    per-table string lists produced by ``__compose_table_content``.
    """

    def __extract_table_content(self, tb):
        """Collect every cell's text from a python-docx table and flatten it."""
        df = []
        for row in tb.rows:
            df.append([c.text for c in row.cells])
        return self.__compose_table_content(pd.DataFrame(df))

    def __compose_table_content(self, df):
        """Turn a table DataFrame into "header: value" strings, one per data row."""

        def blockType(b):
            # Classify a cell string into a coarse content type: dates ("Dt"),
            # numbers ("Nu"), codes ("Ca"), English text ("En"), mixed
            # number+text ("NE"), single char ("Sg"), short/long text
            # ("Tx"/"Lx"), person name ("Nr"), or other ("Ot").
            pattern = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
                ("^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            for p, n in pattern:
                if re.search(p, b):
                    return n
            # Fall back to tokenization: longer token runs are treated as text.
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            # "nr" is the tokenizer's part-of-speech tag for person names.
            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        # A table with fewer than two rows has no data rows to compose.
        if len(df) < 2:
            return []
        # Dominant cell type over all non-header rows.
        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
            1, len(df)) for j in range(len(df.iloc[i, :]))])
        max_type = max(max_type.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # header is not necessarily appear in the first line
        if max_type == "Nu":
            # For numeric tables, any row whose dominant type differs is
            # treated as an additional (possibly multi-line) header row.
            for r in range(1, len(df)):
                tys = Counter([blockType(str(df.iloc[r, j]))
                               for j in range(len(df.iloc[r, :]))])
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            # Offsets (negative) of the header rows that precede row i; keep
            # only the contiguous run of headers nearest to this row.
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1
            headers = []
            for j in range(len(df.iloc[i, :])):
                # Join the (deduplicated) header cells above column j.
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)
            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))

        # Wide tables are returned row-per-string; narrow ones as one blob.
        if colnm > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse *fnm* (path or raw bytes) and return (sections, tables).

        Page numbers are approximated by counting rendered page breaks in the
        run XML; paragraphs outside [from_page, to_page) contribute no text.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # parsed page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            runs_within_single_paragraph = []  # save runs within the range of pages
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    runs_within_single_paragraph.append(run.text)  # append run.text first

                # wrap page break checker into a static method
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else ''))  # then concat run.text as part of the paragraph

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
||||
189
deepdoc/parser/excel_parser.py
Normal file
189
deepdoc/parser/excel_parser.py
Normal file
@@ -0,0 +1,189 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook, load_workbook
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
# copied from `/openpyxl/cell/cell.py`
|
||||
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
|
||||
|
||||
|
||||
class RAGFlowExcelParser:
    """Parse spreadsheet (xlsx/xls/csv) content into text lines, HTML tables,
    or Markdown, normalizing everything to an openpyxl Workbook first."""

    @staticmethod
    def _load_excel_to_workbook(file_like_object):
        """Return an openpyxl Workbook for *file_like_object* (bytes, BytesIO,
        or a path string), transparently converting CSV input.

        Raises:
            Exception: when neither CSV nor any Excel engine can parse it.
        """
        if isinstance(file_like_object, bytes):
            file_like_object = BytesIO(file_like_object)

        # Read first 4 bytes to determine file type
        file_like_object.seek(0)
        file_head = file_like_object.read(4)
        file_like_object.seek(0)

        # "PK\x03\x04" is the ZIP magic (xlsx); "\xd0\xcf\x11\xe0" is the OLE2
        # magic (legacy xls). Anything else is assumed to be CSV.
        if not (file_head.startswith(b"PK\x03\x04") or file_head.startswith(b"\xd0\xcf\x11\xe0")):
            logging.info("Not an Excel file, converting CSV to Excel Workbook")

            try:
                file_like_object.seek(0)
                df = pd.read_csv(file_like_object)
                return RAGFlowExcelParser._dataframe_to_workbook(df)

            except Exception as e_csv:
                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")

        try:
            return load_workbook(file_like_object, data_only=True)
        except Exception as e:
            # Fall back to pandas engines when openpyxl rejects the file.
            logging.info(f"openpyxl load error: {e}, try pandas instead")
            try:
                file_like_object.seek(0)
                try:
                    df = pd.read_excel(file_like_object)
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
                except Exception as ex:
                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
                    file_like_object.seek(0)
                    df = pd.read_excel(file_like_object, engine="calamine")
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_pandas:
                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")

    @staticmethod
    def _clean_dataframe(df: pd.DataFrame):
        """Replace control characters openpyxl refuses to store with spaces."""
        def clean_string(s):
            if isinstance(s, str):
                return ILLEGAL_CHARACTERS_RE.sub(" ", s)
            return s

        return df.apply(lambda col: col.map(clean_string))

    @staticmethod
    def _dataframe_to_workbook(df):
        """Materialize a DataFrame as a single-sheet Workbook named "Data",
        with the column labels written as row 1."""
        df = RAGFlowExcelParser._clean_dataframe(df)
        wb = Workbook()
        ws = wb.active
        ws.title = "Data"

        for col_num, column_name in enumerate(df.columns, 1):
            ws.cell(row=1, column=col_num, value=column_name)

        for row_num, row in enumerate(df.values, 2):
            for col_num, value in enumerate(row, 1):
                ws.cell(row=row_num, column=col_num, value=value)

        return wb

    def html(self, fnm, chunk_rows=256):
        """Render each sheet as HTML <table> chunks of at most *chunk_rows*
        data rows, repeating the header row in every chunk."""
        from html import escape

        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
        tb_chunks = []

        def _fmt(v):
            # Empty string for missing cells; stripped str() otherwise.
            if v is None:
                return ""
            return str(v).strip()

        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue

            # First row becomes the <th> header, reused for every chunk.
            tb_rows_0 = "<tr>"
            for t in list(rows[0]):
                tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
            tb_rows_0 += "</tr>"

            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
                tb = ""
                tb += f"<table><caption>{sheetname}</caption>"
                tb += tb_rows_0
                for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
                    tb += "<tr>"
                    for i, c in enumerate(r):
                        if c.value is None:
                            tb += "<td></td>"
                        else:
                            tb += f"<td>{escape(_fmt(c.value))}</td>"
                    tb += "</tr>"
                tb += "</table>\n"
                tb_chunks.append(tb)

        return tb_chunks

    def markdown(self, fnm):
        """Render the (first sheet of the) spreadsheet as a Markdown table."""
        import pandas as pd

        # NOTE(review): when fnm is a str path, .seek(0) below raises
        # AttributeError before pandas runs — confirm callers pass bytes here.
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        try:
            file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
        except Exception as e:
            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
            file_like_object.seek(0)
            df = pd.read_csv(file_like_object)
        df = df.replace(r"^\s*$", "", regex=True)
        return df.to_markdown(index=False)

    def __call__(self, fnm):
        """Flatten every sheet into "header:value; ..." text lines, appending
        the sheet name when it is not a generic "Sheet*" label."""
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)

        res = []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue
            ti = list(rows[0])  # header cells
            for r in list(rows[1:]):
                fields = []
                for i, c in enumerate(r):
                    if not c.value:
                        continue
                    t = str(ti[i].value) if i < len(ti) else ""
                    t += (":" if t else "") + str(c.value)
                    fields.append(t)
                line = "; ".join(fields)
                if sheetname.lower().find("sheet") < 0:
                    line += " ——" + sheetname
                res.append(line)
        return res

    @staticmethod
    def row_number(fnm, binary):
        """Count data rows in *binary* based on the extension of *fnm*.

        NOTE(review): extensions other than xls*/csv/txt fall through and
        return None implicitly — confirm callers handle that.
        """
        if fnm.split(".")[-1].lower().find("xls") >= 0:
            wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
            total = 0
            for sheetname in wb.sheetnames:
                ws = wb[sheetname]
                total += len(list(ws.rows))
            return total

        if fnm.split(".")[-1].lower() in ["csv", "txt"]:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
            return len(txt.split("\n"))
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc CLI: parse the spreadsheet given as the first argument.
    psr = RAGFlowExcelParser()
    psr(sys.argv[1])
||||
105
deepdoc/parser/figure_parser.py
Normal file
105
deepdoc/parser/figure_parser.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from api.utils.api_utils import timeout
|
||||
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
|
||||
from rag.prompts.generator import vision_llm_figure_describe_prompt
|
||||
|
||||
|
||||
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
    """Wrap position-less figure data into the positioned form.

    Each input item is indexed as (description, image); items whose second
    element is not a PIL image are dropped. The output items have the shape
    ``((image, [description]), [(0, 0, 0, 0, 0)])`` — i.e. a dummy
    all-zero position entry is attached to every figure.
    """
    wrapped = []
    for figure_data in figures_data_without_positions:
        if not isinstance(figure_data[1], Image.Image):
            continue
        image_with_desc = (figure_data[1], [figure_data[0]])
        wrapped.append((image_with_desc, [(0, 0, 0, 0, 0)]))
    return wrapped
||||
|
||||
|
||||
# Module-wide pool shared by all VisionFigureParser instances, capping
# concurrent vision-model calls at 10 across the whole process.
shared_executor = ThreadPoolExecutor(max_workers=10)
|
||||
|
||||
|
||||
class VisionFigureParser:
    """Describe extracted figures with a vision LLM.

    Accepts figure data with or without position tuples; calling the instance
    asks the vision model for a description of each figure concurrently and
    returns the re-assembled ``((figure, [descriptions]), position?)`` list.
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        # Invariants: one description list per figure, and positions are
        # either absent entirely or present for every figure.
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    def _extract_figures_info(self, figures_data):
        """Split *figures_data* items into figures / descriptions / positions.

        Two accepted item shapes:
          * ``((figure, [desc]), [(pn, x0, x1, top, bottom)])`` — with position
          * ``(figure, [desc])`` — without position
        """
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            # position
            if len(item) == 2 and isinstance(item[0], tuple) and len(item[0]) == 2 and isinstance(item[1], list) and isinstance(item[1][0], tuple) and len(item[1][0]) == 5:
                img_desc = item[0]
                assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        """Zip figures, descriptions and (optional) positions back together."""
        self.assembled = []
        self.has_positions = len(self.positions) != 0
        for i in range(len(self.figures)):
            figure = self.figures[i]
            desc = self.descriptions[i]
            pos = self.positions[i] if self.has_positions else None

            figure_desc = (figure, desc)

            # Positioned items are 2-tuples; unpositioned ones 1-tuples.
            if pos is not None:
                self.assembled.append((figure_desc, pos))
            else:
                self.assembled.append((figure_desc,))

        return self.assembled

    def __call__(self, **kwargs):
        """Run the vision model over every figure and return the assembled list.

        Keyword Args:
            callback: optional ``callback(prog, msg)`` progress hook.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)

        # Each task is bounded by the project `timeout` decorator so one stuck
        # model call cannot hang the whole batch.
        @timeout(30, 3)
        def process(figure_idx, figure_binary):
            description_text = picture_vision_llm_chunk(
                binary=figure_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )
            return figure_idx, description_text

        futures = []
        for idx, img_binary in enumerate(self.figures or []):
            futures.append(shared_executor.submit(process, idx, img_binary))

        for future in as_completed(futures):
            figure_num, txt = future.result()
            if txt:
                # Prepend the generated description to the joined prior ones.
                # NOTE(review): this concatenates txt directly with the join —
                # possibly missing a "\n" separator between txt and the first
                # prior description; confirm intended.
                self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])

        self._assemble()

        return self.assembled
||||
214
deepdoc/parser/html_parser.py
Normal file
214
deepdoc/parser/html_parser.py
Normal file
@@ -0,0 +1,214 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from rag.nlp import find_codec, rag_tokenizer
|
||||
import uuid
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
import html
|
||||
|
||||
def get_encoding(file):
    """Detect and return the character encoding of *file* via chardet."""
    with open(file, 'rb') as fp:
        detection = chardet.detect(fp.read())
    return detection['encoding']
||||
|
||||
# HTML tags that delimit a logical text block when flattening a document.
BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption"
]
# Markdown heading prefix for each HTML heading level.
# Fix: "h4" previously mapped to five '#' characters (same as "h5"),
# producing the wrong heading level; it must be four.
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}
||||
|
||||
|
||||
class RAGFlowHtmlParser:
    """Parse HTML into token-bounded text chunks plus standalone table chunks.

    Fix: in ``merge_block_text`` the non-block, non-table branch had a
    mis-parenthesized ternary — ``(" " if current_content else "" + content)``
    — which silently dropped the content whenever ``current_content`` was
    non-empty (only a space was appended). It now matches the correct form
    used in the block-id branch: ``(" " if current_content else "") + content``.
    """

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse *binary* (preferred) or the file at *fnm* and return chunks."""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                txt = f.read()
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num):
        """Strip noise from the HTML, flatten it, and chunk by token budget.

        Returns text chunks first, then one chunk per extracted table.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be string!")

        temp_sections = []
        soup = BeautifulSoup(txt, "html5lib")
        # delete <style> and <script> tags
        for style_tag in soup.find_all(["style", "script"]):
            style_tag.decompose()
        # delete <script> tag in <div>
        for div_tag in soup.find_all("div"):
            for script_tag in div_tag.find_all("script"):
                script_tag.decompose()
        # delete inline style
        for tag in soup.find_all(True):
            if 'style' in tag.attrs:
                del tag.attrs['style']
        # delete HTML comment
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
        block_txt_list, table_list = cls.merge_block_text(temp_sections)
        sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
        for table in table_list:
            sections.append(table.get("content", ""))
        return sections

    @classmethod
    def split_table(cls, html_table, chunk_token_num=512):
        """Split an HTML table into several <table> strings, each holding at
        most ~chunk_token_num tokens' worth of rows."""
        soup = BeautifulSoup(html_table, "html.parser")
        rows = soup.find_all("tr")
        tables = []
        current_table = []
        current_count = 0
        table_str_list = []
        for row in rows:
            tks_str = rag_tokenizer.tokenize(str(row))
            token_count = len(tks_str.split(" ")) if tks_str else 0
            # Start a new group when this row would exceed the budget.
            if current_count + token_count > chunk_token_num:
                tables.append(current_table)
                current_table = []
                current_count = 0
            current_table.append(row)
            current_count += token_count
        if current_table:
            tables.append(current_table)

        # Re-wrap each group of rows in its own <table> element.
        for table_rows in tables:
            new_table = soup.new_tag("table")
            for row in table_rows:
                new_table.append(row)
            table_str_list.append(str(new_table))

        return table_str_list

    @classmethod
    def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
        """Depth-first walk producing {"content", "tag_name", "metadata"} dicts.

        Text nodes that themselves contain HTML are re-parsed; tables are
        emitted whole with their own table_id; descendants of a block-level
        tag share one block_id so they can be merged later.
        """
        if isinstance(element, NavigableString):
            content = element.strip()

            def is_valid_html(content):
                # A string that bs4 can find a tag in is treated as embedded HTML.
                try:
                    soup = BeautifulSoup(content, "html.parser")
                    return bool(soup.find())
                except Exception:
                    return False

            return_info = []
            if content:
                if is_valid_html(content):
                    soup = BeautifulSoup(content, "html.parser")
                    child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id)
                    parser_result.extend(child_info)
                else:
                    info = {"content": element.strip(), "tag_name": "inner_text", "metadata": {"block_id": block_id}}
                    if parent_name:
                        info["tag_name"] = parent_name
                    return_info.append(info)
            return return_info
        elif isinstance(element, Tag):

            if str.lower(element.name) == "table":
                table_info_list = []
                table_id = str(uuid.uuid1())
                table_list = [html.unescape(str(element))]
                for t in table_list:
                    table_info_list.append({"content": t, "tag_name": "table",
                                            "metadata": {"table_id": table_id, "index": table_list.index(t)}})
                return table_info_list
            else:
                block_id = None
                # Each block-level tag opens a fresh block for its descendants.
                if str.lower(element.name) in BLOCK_TAGS:
                    block_id = str(uuid.uuid1())
                for child in element.children:
                    child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name,
                                                           block_id)
                    parser_result.extend(child_info)
                return []

    @classmethod
    def merge_block_text(cls, parser_result):
        """Merge walker output into per-block strings; tables are split off.

        Returns (block_texts, table_items).
        """
        block_content = []
        current_content = ""
        table_info_list = []
        lask_block_id = None
        for item in parser_result:
            content = item.get("content")
            tag_name = item.get("tag_name")
            title_flag = tag_name in TITLE_TAGS
            block_id = item.get("metadata", {}).get("block_id")
            if block_id:
                # Headings get their Markdown prefix.
                if title_flag:
                    content = f"{TITLE_TAGS[tag_name]} {content}"
                if lask_block_id != block_id:
                    # Crossing into a new block: flush the accumulated one.
                    if lask_block_id is not None:
                        block_content.append(current_content)
                    current_content = content
                    lask_block_id = block_id
                else:
                    current_content += (" " if current_content else "") + content
            else:
                if tag_name == "table":
                    table_info_list.append(item)
                else:
                    # FIX: ternary was mis-parenthesized as
                    # (" " if current_content else "" + content), dropping the
                    # content whenever current_content was non-empty.
                    current_content += (" " if current_content else "") + content
        if current_content:
            block_content.append(current_content)
        return block_content, table_info_list

    @classmethod
    def chunk_block(cls, block_txt_list, chunk_token_num=512):
        """Pack block texts into chunks of at most chunk_token_num tokens,
        splitting any single oversized block on token boundaries."""
        chunks = []
        current_block = ""
        current_token_count = 0

        for block in block_txt_list:
            tks_str = rag_tokenizer.tokenize(block)
            block_token_count = len(tks_str.split(" ")) if tks_str else 0
            if block_token_count > chunk_token_num:
                # Oversized block: flush what we have, then hard-split it.
                if current_block:
                    chunks.append(current_block)
                start = 0
                tokens = tks_str.split(" ")
                while start < len(tokens):
                    end = start + chunk_token_num
                    split_tokens = tokens[start:end]
                    chunks.append(" ".join(split_tokens))
                    start = end
                current_block = ""
                current_token_count = 0
            else:
                if current_token_count + block_token_count <= chunk_token_num:
                    current_block += ("\n" if current_block else "") + block
                    current_token_count += block_token_count
                else:
                    chunks.append(current_block)
                    current_block = block
                    current_token_count = block_token_count

        if current_block:
            chunks.append(current_block)

        return chunks
||||
|
||||
179
deepdoc/parser/json_parser.py
Normal file
179
deepdoc/parser/json_parser.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# The following documents are mainly referenced, and only adaptation modifications have been made
|
||||
# from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
class RAGFlowJsonParser:
    """Split JSON or JSONL payloads into size-bounded JSON-string chunks.

    Adapted from langchain_text_splitters' RecursiveJsonSplitter; sizes are
    measured in characters of the serialized (non-ASCII-preserving) JSON.
    """

    def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
        super().__init__()
        # The working budget is doubled relative to the requested size.
        self.max_chunk_size = max_chunk_size * 2
        self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)

    def __call__(self, binary):
        """Decode *binary*, auto-detect JSON vs JSONL, and return chunk strings."""
        encoding = find_codec(binary)
        txt = binary.decode(encoding, errors="ignore")

        if self.is_jsonl_format(txt):
            sections = self._parse_jsonl(txt)
        else:
            sections = self._parse_json(txt)
        return sections

    @staticmethod
    def _json_size(data: dict) -> int:
        """Calculate the size of the serialized JSON object."""
        return len(json.dumps(data, ensure_ascii=False))

    @staticmethod
    def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
        """Set a value in a nested dictionary based on the given path."""
        for key in path[:-1]:
            d = d.setdefault(key, {})
        d[path[-1]] = value

    def _list_to_dict_preprocessing(self, data: Any) -> Any:
        """Recursively replace lists with index-keyed dicts so the splitter
        only has to handle dict nesting."""
        if isinstance(data, dict):
            # Process each key-value pair in the dictionary
            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
        elif isinstance(data, list):
            # Convert the list to a dictionary with index-based keys
            return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
        else:
            # Base case: the item is neither a dict nor a list, so return it unchanged
            return data

    def _json_split(
        self,
        data,
        current_path: list[str] | None,
        chunks: list[dict] | None,
    ) -> list[dict]:
        """
        Split json into maximum size dictionaries while preserving structure.

        Recursively packs key/value pairs into the last chunk until the
        serialized size would exceed max_chunk_size, then opens a new chunk
        (only once the current one has reached min_chunk_size).
        """
        current_path = current_path or []
        chunks = chunks or [{}]
        if isinstance(data, dict):
            for key, value in data.items():
                new_path = current_path + [key]
                chunk_size = self._json_size(chunks[-1])
                size = self._json_size({key: value})
                remaining = self.max_chunk_size - chunk_size

                if size < remaining:
                    # Add item to current chunk
                    self._set_nested_dict(chunks[-1], new_path, value)
                else:
                    if chunk_size >= self.min_chunk_size:
                        # Chunk is big enough, start a new chunk
                        chunks.append({})

                    # Iterate
                    self._json_split(value, new_path, chunks)
        else:
            # handle single item
            self._set_nested_dict(chunks[-1], current_path, data)
        return chunks

    def split_json(
        self,
        json_data,
        convert_lists: bool = False,
    ) -> list[dict]:
        """Splits JSON into a list of JSON chunks"""

        if convert_lists:
            preprocessed_data = self._list_to_dict_preprocessing(json_data)
            chunks = self._json_split(preprocessed_data, None, None)
        else:
            chunks = self._json_split(json_data, None, None)

        # Remove the last chunk if it's empty
        if not chunks[-1]:
            chunks.pop()
        return chunks

    def split_text(
        self,
        json_data: dict[str, Any],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
    ) -> list[str]:
        """Splits JSON into a list of JSON formatted strings"""

        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

        # Convert to string
        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]

    def _parse_json(self, content: str) -> list[str]:
        """Parse one JSON document; invalid JSON yields an empty list."""
        sections = []
        try:
            json_data = json.loads(content)
            chunks = self.split_json(json_data, True)
            sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
        except json.JSONDecodeError:
            pass
        return sections

    def _parse_jsonl(self, content: str) -> list[str]:
        """Parse JSON-Lines content line by line, skipping invalid lines."""
        lines = content.strip().splitlines()
        all_chunks = []
        for line in lines:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                chunks = self.split_json(data, convert_lists=True)
                all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
            except json.JSONDecodeError:
                continue
        return all_chunks

    def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
        """Heuristically decide whether *txt* is JSONL rather than one JSON doc.

        A text that parses whole is JSON; otherwise it is JSONL when at least
        *threshold* of the first *sample_limit* non-empty lines parse alone.
        """
        lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
        if not lines:
            return False

        try:
            json.loads(txt)
            return False
        except json.JSONDecodeError:
            pass

        sample_limit = min(len(lines), sample_limit)
        sample_lines = lines[:sample_limit]
        valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))

        if not valid_lines:
            return False

        return (valid_lines / len(sample_lines)) >= threshold

    def _is_valid_json(self, line: str) -> bool:
        """Return True when *line* parses as a standalone JSON value."""
        try:
            json.loads(line)
            return True
        except json.JSONDecodeError:
            return False
||||
273
deepdoc/parser/markdown_parser.py
Normal file
273
deepdoc/parser/markdown_parser.py
Normal file
@@ -0,0 +1,273 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
import mistune
|
||||
from markdown import markdown
|
||||
|
||||
|
||||
class RAGFlowMarkdownParser:
    """Markdown pre-processor that pulls tables out of a document.

    ``extract_tables_and_remainder`` returns the text with tables either
    removed (``separate_tables=True``) or inlined, plus the list of raw
    table snippets that were found.
    """

    def __init__(self, chunk_token_num=128):
        # Target chunk size in tokens; kept as int so callers may pass strings.
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Find markdown and HTML tables in *markdown_text*.

        Args:
            markdown_text: the raw markdown source.
            separate_tables: when True, tables are removed from the returned
                text; when False, markdown tables are replaced by their HTML
                rendering and HTML tables are kept verbatim.

        Returns:
            ``(remainder_text, tables)`` where ``tables`` is a list of the raw
            matched table snippets in the order found.
        """
        tables = []
        working_text = markdown_text

        def replace_tables_with_rendered_html(pattern, table_list, render=True):
            # Scan the current `working_text` for `pattern`; collect each raw
            # match into `table_list` and either drop it or re-insert it
            # (rendered to HTML when `render` is True) in the rebuilt text.
            new_text = ""
            last_end = 0
            for match in pattern.finditer(working_text):
                raw_table = match.group()
                table_list.append(raw_table)
                if separate_tables:
                    # Skip this match (i.e., remove it)
                    new_text += working_text[last_end : match.start()] + "\n\n"
                else:
                    # Replace with rendered HTML (or the raw snippet when render=False)
                    html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
                    new_text += working_text[last_end : match.start()] + html_table + "\n\n"
                last_end = match.end()
            new_text += working_text[last_end:]
            return new_text

        if "|" in markdown_text:  # cheap pre-check before running the table regexes
            # Standard Markdown table (bordered: every row starts with '|')
            border_table_pattern = re.compile(
                r"""
                (?:\n|^)
                (?:\|.*?\|.*?\|.*?\n)
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
            """,
                re.VERBOSE,
            )
            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)

            # Borderless Markdown table (rows separated by '|' but no leading pipe)
            no_border_table_pattern = re.compile(
                r"""
                (?:\n|^)
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                """,
                re.VERBOSE,
            )
            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)

        if "<table>" in working_text.lower():  # cheap pre-check before the HTML regex
            # HTML table extraction - handle possible html/body wrapper tags
            html_table_pattern = re.compile(
                r"""
                (?:\n|^)
                \s*
                (?:
                    # case1: <html><body><table>...</table></body></html>
                    (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
                    |
                    # case2: <body><table>...</table></body>
                    (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
                    |
                    # case3: only<table>...</table>
                    (?:<table[^>]*>.*?</table>)
                )
                \s*
                (?=\n|$)
                """,
                re.VERBOSE | re.DOTALL | re.IGNORECASE,
            )
            # HTML tables are never re-rendered: render=False keeps the raw
            # snippet when inlining, which matches the previous dedicated
            # (and otherwise line-for-line identical) replacement loop.
            working_text = replace_tables_with_rendered_html(html_table_pattern, tables, render=False)

        return working_text, tables
|
||||
|
||||
|
||||
class MarkdownElementExtractor:
    """Split markdown text into a flat list of top-level element strings.

    Extraction is purely line-based: ``extract_elements`` walks ``self.lines``
    once and delegates to per-element helpers, each returning the element's
    text plus the inclusive line range it consumed.
    """

    def __init__(self, markdown_content):
        # Raw text and its line view used by every extractor below.
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")
        # A mistune AST is built eagerly; NOTE(review): none of the visible
        # extraction methods consult these nodes — confirm before removing.
        self.ast_parser = mistune.create_markdown(renderer="ast")
        self.ast_nodes = self.ast_parser(markdown_content)

    def extract_elements(self):
        """Extract individual elements (headers, code blocks, lists, etc.)"""
        sections = []

        i = 0
        while i < len(self.lines):
            line = self.lines[i]

            if re.match(r"^#{1,6}\s+.*$", line):
                # header
                element = self._extract_header(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif line.strip().startswith("```"):
                # code block
                element = self._extract_code_block(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
                # list block (bulleted or numbered)
                element = self._extract_list_block(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif line.strip().startswith(">"):
                # blockquote
                element = self._extract_blockquote(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif line.strip():
                # text block (paragraphs and inline elements until next block element)
                element = self._extract_text_block(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            else:
                # blank line between elements — nothing to collect
                i += 1

        # Drop sections that ended up empty or whitespace-only.
        sections = [section for section in sections if section.strip()]
        return sections

    def _extract_header(self, start_pos):
        """A header is always exactly one line."""
        return {
            "type": "header",
            "content": self.lines[start_pos],
            "start_line": start_pos,
            "end_line": start_pos,
        }

    def _extract_code_block(self, start_pos):
        """Consume from the opening ``` fence through the closing fence.

        If no closing fence exists, everything to the end of input is consumed.
        """
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]

        # Find the end of the code block
        for i in range(start_pos + 1, len(self.lines)):
            content_lines.append(self.lines[i])
            end_pos = i
            if self.lines[i].strip().startswith("```"):
                break

        return {
            "type": "code_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_list_block(self, start_pos):
        """Consume consecutive list items plus their continuations.

        Blank lines, indented sub-items, and indented wrapped text after the
        first item are all treated as part of the same list block.
        """
        end_pos = start_pos
        content_lines = []

        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            # check if this line is a list item or continuation of a list
            if (
                re.match(r"^\s*[-*+]\s+.*$", line)
                or re.match(r"^\s*\d+\.\s+.*$", line)
                or (i > start_pos and not line.strip())
                or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
                or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
                or (i > start_pos and re.match(r"^\s+\w+.*$", line))
            ):
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break

        return {
            "type": "list_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_blockquote(self, start_pos):
        """Consume consecutive '>' lines; interior blank lines are kept."""
        end_pos = start_pos
        content_lines = []

        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            if line.strip().startswith(">") or (i > start_pos and not line.strip()):
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break

        return {
            "type": "blockquote",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element"""
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]

        i = start_pos + 1
        while i < len(self.lines):
            line = self.lines[i]
            # stop if we encounter a block element
            if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
                break
            elif not line.strip():
                # Blank line: only ends the text block when the next line
                # starts a block element; otherwise it is kept as part of
                # the paragraph run.
                if i + 1 < len(self.lines) and (
                    re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
                    or self.lines[i + 1].strip().startswith("```")
                    or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
                    or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
                    or self.lines[i + 1].strip().startswith(">")
                ):
                    break
                else:
                    content_lines.append(line)
                    end_pos = i
                    i += 1
            else:
                content_lines.append(line)
                end_pos = i
                i += 1

        return {
            "type": "text_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }
|
||||
1287
deepdoc/parser/pdf_parser.py
Normal file
1287
deepdoc/parser/pdf_parser.py
Normal file
File diff suppressed because it is too large
Load Diff
99
deepdoc/parser/ppt_parser.py
Normal file
99
deepdoc/parser/ppt_parser.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class RAGFlowPptParser:
    """Extract plain text from PowerPoint files via python-pptx.

    Calling an instance returns one string per slide in the requested page
    range, with shapes visited roughly top-to-bottom, left-to-right.
    """

    def __init__(self):
        super().__init__()

    def __get_bulleted_text(self, paragraph):
        # A paragraph is considered bulleted when its XML carries any of the
        # three bullet properties (character, auto-number, or picture bullet).
        is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
        if is_bulleted:
            # Indent by nesting level and prefix a dot as the bullet marker.
            return f"{' '* paragraph.level}.{paragraph.text}"
        else:
            return paragraph.text

    def __extract(self, shape):
        """Best-effort text extraction from a single shape.

        Returns "" for shapes with no extractable text; any unexpected error
        is logged and swallowed so one bad shape cannot abort the slide.
        """
        try:
            # First try to get text content
            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
                text_frame = shape.text_frame
                texts = []
                for paragraph in text_frame.paragraphs:
                    if paragraph.text.strip():
                        texts.append(self.__get_bulleted_text(paragraph))
                return "\n".join(texts)

            # Safely get shape_type
            try:
                shape_type = shape.shape_type
            except NotImplementedError:
                # If shape_type is not available, try to get text content
                if hasattr(shape, 'text'):
                    return shape.text.strip()
                return ""

            # Handle table (19 — presumably MSO_SHAPE_TYPE.TABLE; confirm
            # against the python-pptx enum before changing)
            if shape_type == 19:
                tb = shape.table
                rows = []
                # Row 0 is treated as the header: each cell becomes
                # "header: value" pairs joined per data row.
                for i in range(1, len(tb.rows)):
                    rows.append("; ".join([tb.cell(
                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
                return "\n".join(rows)

            # Handle group shape (6 — presumably MSO_SHAPE_TYPE.GROUP) by
            # recursing into children in visual order.
            if shape_type == 6:
                texts = []
                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
                    t = self.__extract(p)
                    if t:
                        texts.append(t)
                return "\n".join(texts)

            return ""

        except Exception as e:
            logging.error(f"Error processing shape: {str(e)}")
            return ""

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Parse *fnm* (path or raw bytes) and return a list of slide texts.

        Only slides in [from_page, to_page) are parsed; ``self.total_page``
        is set to the presentation's slide count as a side effect.
        NOTE(review): ``callback`` is accepted but never invoked here.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(
            BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            # Visit shapes top-to-bottom (bucketed by 10 EMUs) then
            # left-to-right; missing coordinates sort as 0.
            for shape in sorted(
                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
                try:
                    txt = self.__extract(shape)
                    if txt:
                        texts.append(txt)
                except Exception as e:
                    logging.exception(e)
            txts.append("\n".join(texts))

        return txts
|
||||
109
deepdoc/parser/resume/__init__.py
Normal file
109
deepdoc/parser/resume/__init__.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
def refactor(cv):
    """Normalize a parsed resume dict in place and return it.

    Drops parser bookkeeping fields, re-keys the section lists as
    ``{"0": item, ...}`` dicts, promotes the latest work/education entry
    into ``cv["basic"]``, and guarantees ``basic``/``contact`` exist.
    """
    # Drop bookkeeping left over from the parsing stage.
    for key in ("raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"):
        if cv.get(key) is not None:
            del cv[key]
    cv["is_deleted"] = 0
    cv.setdefault("basic", {})
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    # Normalize each section to a string-indexed dict, stripping the
    # per-entry "external" payload along the way.
    for section in ("education", "work", "certificate", "project", "language", "skill", "training"):
        items = cv.get(section)
        if items is None:
            continue
        if isinstance(items, dict):
            items = [v for _, v in items.items()]
        if not isinstance(items, list):
            del cv[section]
            continue
        cleaned = []
        for entry in items:
            if "external" in entry and entry["external"] is not None:
                del entry["external"]
            cleaned.append(entry)
        cv[section] = {str(idx): item for idx, item in enumerate(cleaned)}

    # Rename legacy salary fields when they carry a truthy value.
    for old, new in (
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    ):
        if cv["basic"].get(old):
            cv["basic"][new] = cv["basic"].pop(old)

    # Chronologically ordered work and education histories.
    jobs = sorted(cv.get("work", {}).values(), key=lambda j: j.get("start_time", ""))
    schooling = sorted(cv.get("education", {}).values(), key=lambda e: e.get("start_time", ""))

    if jobs:
        latest = jobs[-1]
        cv["basic"]["work_start_time"] = jobs[0].get("start_time", "")
        cv["basic"]["management_experience"] = (
            "Y" if any(j.get("management_experience", "") == "Y" for j in jobs) else "N"
        )
        cv["basic"]["annual_salary"] = latest.get("annual_salary_from", "0")

        # Promote the most recent job's descriptive fields into basic.
        for field in (
            "annual_salary_from",
            "annual_salary_to",
            "industry_name",
            "position_name",
            "responsibilities",
            "corporation_type",
            "scale",
            "corporation_name",
        ):
            cv["basic"][field] = latest.get(field, "")

    if schooling:
        for field in ("school_name", "discipline_name"):
            if field in schooling[-1]:
                cv["basic"][field] = schooling[-1][field]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    cv.setdefault("contact", {})
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
|
||||
15
deepdoc/parser/resume/entities/__init__.py
Normal file
15
deepdoc/parser/resume/entities/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
128
deepdoc/parser/resume/entities/corporations.py
Normal file
128
deepdoc/parser/resume/entities/corporations.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
import pandas as pd
|
||||
from rag.nlp import rag_tokenizer
|
||||
from . import regions
|
||||
|
||||
|
||||
# Resource tables shipped in the res/ directory next to this module.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# Corporation "baike" (encyclopedia) entry lengths, indexed by corp id (string).
GOODS = pd.read_csv(
    os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])


def _load_json(relative_name):
    # Context manager closes the file deterministically — the previous bare
    # open() calls leaked their descriptors until garbage collection.
    with open(os.path.join(current_file_path, relative_name), "r", encoding="utf-8") as f:
        return json.load(f)


CORP_TKS = _load_json("res/corp.tks.freq.json")  # corporation-name token frequencies
GOOD_CORP = _load_json("res/good_corp.json")     # list of well-known corporation names
CORP_TAG = _load_json("res/corp_tag.json")       # corporation name -> tag list
|
||||
|
||||
|
||||
def baike(cid, default_v=0):
    """Return the baike entry length for corporation *cid*.

    Falls back to *default_v* on any lookup failure (unknown id, bad key
    type, missing table) — this is a deliberate best-effort lookup.
    """
    try:
        return GOODS.loc[str(cid), "len"]
    except Exception:
        return default_v
|
||||
|
||||
|
||||
def corpNorm(nm, add_region=True):
    """Normalize a corporation name into a canonical matching key.

    Lowercases, simplifies traditional characters, strips punctuation and
    legal-entity suffixes, removes region words and high-frequency tokens,
    and (when *add_region* is True) re-appends the first region word found,
    e.g. ``"(北京)"``. Returns "" for falsy or non-string input.
    """
    global CORP_TKS
    if not nm or not isinstance(nm, str):
        return ""
    # Full-width -> half-width, traditional -> simplified, lowercase.
    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
    # Decode HTML-escaped ampersands.
    nm = re.sub(r"&amp;", "&", nm)
    # Collapse brackets, quotes and similar punctuation to spaces.
    nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
    # Drop trailing legal-entity markers (co./corp./inc./ltd. and em-dash tails).
    nm = re.sub(
        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE
    )
    # Drop Chinese company-type suffixes ("有限公司", "研发中心", ...).
    nm = re.sub(
        r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
        "",
        nm,
        count=10000,
        flags=re.IGNORECASE,
    )
    # Very short names that do not start with a region word are returned as-is.
    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
        return nm

    tks = rag_tokenizer.tokenize(nm).split()
    # Region words found in the name ("中国" counts only when not leading).
    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
        # Skip region words and tokens common across corporation names.
        if regions.isName(t) or t in CORP_TKS:
            continue
        # Keep a space between consecutive latin/numeric tokens.
        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
            nm += " "
        nm += t

    # If the name is a CJK prefix followed by a latin tail (or vice versa),
    # keep only the dominant half.
    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
    if r:
        nm = r.group(1)
    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
    if r:
        nm = r.group(1)
    return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
|
||||
|
||||
|
||||
def rmNoise(n):
    """Strip parenthesized qualifiers and separator punctuation from a name."""
    without_parens = re.sub(r"[\((][^()()]+[))]", "", n)
    return re.sub(r"[,. &()()]+", "", without_parens)
|
||||
|
||||
|
||||
# Normalize the good-corporation whitelist once at import time.
GOOD_CORP = {corpNorm(rmNoise(c), False) for c in GOOD_CORP}

# Log (debug) any tag-table key whose name normalizes away entirely,
# then re-key the tag table by normalized corporation name.
for c, v in CORP_TAG.items():
    normalized = corpNorm(rmNoise(c), False)
    if not normalized:
        logging.debug(c)
CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}
|
||||
|
||||
|
||||
def is_good(nm):
    """Return True when *nm* normalizes to a known good corporation."""
    # Dispatched/outsourced ("外派") positions never count.
    if "外派" in nm:
        return False
    normalized = corpNorm(rmNoise(nm), False)
    for candidate in GOOD_CORP:
        if re.match(r"[0-9a-zA-Z]+$", candidate):
            # Purely alphanumeric names must match exactly.
            if candidate == normalized:
                return True
        elif candidate in normalized:
            return True
    return False
|
||||
|
||||
|
||||
def corp_tag(nm):
    """Return the tag list for corporation *nm* ([] when nothing matches)."""
    normalized = corpNorm(rmNoise(nm), False)
    for known in CORP_TAG.keys():
        if re.match(r"[0-9a-zA-Z., ]+$", known):
            # Latin-only keys must match the whole normalized name.
            if known == normalized:
                return CORP_TAG[known]
        elif known in normalized:
            # Substring hit; skip keys so short the match is likely spurious.
            if len(known) < 3 and len(normalized) / len(known) >= 2:
                continue
            return CORP_TAG[known]
    return []
||||
44
deepdoc/parser/resume/entities/degrees.py
Normal file
44
deepdoc/parser/resume/entities/degrees.py
Normal file
@@ -0,0 +1,44 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Degree-level lookup table: internal id (string) -> Chinese display name.
# Ids are non-contiguous; presumably they mirror an upstream HR system's
# codes — TODO confirm against the data source.
TBL = {
    "94": "EMBA",
    "6": "MBA",
    "95": "MPA",
    "92": "专升本",
    "4": "专科",
    "90": "中专",
    "91": "中技",
    "86": "初中",
    "3": "博士",
    "10": "博士后",
    "1": "本科",
    "2": "硕士",
    "87": "职高",
    "89": "高中",
}

# Reverse map for get_id: display name -> id (names are unique above).
TBL_ = {v: k for k, v in TBL.items()}
|
||||
|
||||
|
||||
def get_name(id):
    """Return the display name for degree id *id* ("" when unknown)."""
    key = str(id)
    return TBL.get(key, "")
|
||||
|
||||
|
||||
def get_id(nm):
    """Reverse lookup: degree display name -> id ("" for falsy/unknown input)."""
    return TBL_.get(nm.upper().strip(), "") if nm else ""
|
||||
712
deepdoc/parser/resume/entities/industries.py
Normal file
712
deepdoc/parser/resume/entities/industries.py
Normal file
@@ -0,0 +1,712 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
TBL = {
|
||||
"1": {"name": "IT/通信/电子", "parent": "0"},
|
||||
"2": {"name": "互联网", "parent": "0"},
|
||||
"3": {"name": "电子商务", "parent": "2"},
|
||||
"4": {"name": "互联网金融", "parent": "2"},
|
||||
"5": {"name": "网络游戏", "parent": "2"},
|
||||
"6": {"name": "社交网络平台", "parent": "2"},
|
||||
"7": {"name": "视频音乐", "parent": "2"},
|
||||
"9": {"name": "安全", "parent": "2"},
|
||||
"10": {"name": "云计算", "parent": "2"},
|
||||
"12": {"name": "工具类客户端应用", "parent": "2"},
|
||||
"13": {"name": "互联网广告", "parent": "2"},
|
||||
"14": {"name": "企业互联网服务", "parent": "2"},
|
||||
"16": {"name": "在线教育", "parent": "2"},
|
||||
"17": {"name": "在线医疗", "parent": "2"},
|
||||
"19": {"name": "B2B", "parent": "3"},
|
||||
"20": {"name": "B2C", "parent": "3"},
|
||||
"21": {"name": "C2C", "parent": "3"},
|
||||
"22": {"name": "生活信息本地化", "parent": "3"},
|
||||
"23": {"name": "在线旅游", "parent": "2"},
|
||||
"24": {"name": "第三方支付", "parent": "4"},
|
||||
"26": {"name": "客户端游戏", "parent": "5"},
|
||||
"27": {"name": "网页游戏", "parent": "5"},
|
||||
"28": {"name": "手机游戏", "parent": "5"},
|
||||
"29": {"name": "微博", "parent": "6"},
|
||||
"30": {"name": "社交网站", "parent": "6"},
|
||||
"31": {"name": "在线视频", "parent": "7"},
|
||||
"32": {"name": "在线音乐", "parent": "7"},
|
||||
"35": {"name": "企业安全", "parent": "9"},
|
||||
"36": {"name": "个人安全", "parent": "9"},
|
||||
"37": {"name": "企业级云服务", "parent": "10"},
|
||||
"38": {"name": "个人级云服务", "parent": "10"},
|
||||
"43": {"name": "输入法", "parent": "12"},
|
||||
"44": {"name": "浏览器", "parent": "12"},
|
||||
"45": {"name": "词典", "parent": "12"},
|
||||
"46": {"name": "播放器", "parent": "12"},
|
||||
"47": {"name": "下载器", "parent": "12"},
|
||||
"48": {"name": "IM", "parent": "12"},
|
||||
"49": {"name": "广告服务", "parent": "13"},
|
||||
"50": {"name": "第三方广告网络平台", "parent": "13"},
|
||||
"51": {"name": "媒体代理", "parent": "13"},
|
||||
"52": {"name": "创意代理", "parent": "13"},
|
||||
"53": {"name": "IT-综合", "parent": "1"},
|
||||
"71": {"name": "团购", "parent": "3"},
|
||||
"72": {"name": "地图", "parent": "2"},
|
||||
"73": {"name": "数据存储", "parent": "2"},
|
||||
"414": {"name": "计算机软件", "parent": "1"},
|
||||
"415": {"name": "计算机硬件", "parent": "1"},
|
||||
"416": {"name": "计算机服务(系统、数据服务、维修)", "parent": "1"},
|
||||
"417": {"name": "通信/电信/网络设备", "parent": "1"},
|
||||
"418": {"name": "通信/电信运营、增值服务", "parent": "1"},
|
||||
"419": {"name": "电子技术/半导体/集成电路", "parent": "1"},
|
||||
"472": {"name": "P2P网贷", "parent": "4"},
|
||||
"473": {"name": "互联网理财", "parent": "4"},
|
||||
"474": {"name": "婚恋", "parent": "6"},
|
||||
"476": {"name": "虚拟化", "parent": "10"},
|
||||
"477": {"name": "邮箱", "parent": "12"},
|
||||
"478": {"name": "商业智能", "parent": "14"},
|
||||
"479": {"name": "企业建站", "parent": "14"},
|
||||
"480": {"name": "安防", "parent": "14"},
|
||||
"481": {"name": "网络营销", "parent": "2"},
|
||||
"487": {"name": "智能终端", "parent": "2"},
|
||||
"488": {"name": "移动互联网", "parent": "2"},
|
||||
"489": {"name": "数字城市", "parent": "2"},
|
||||
"490": {"name": "大数据", "parent": "2"},
|
||||
"491": {"name": "互联网人力资源", "parent": "2"},
|
||||
"492": {"name": "舆情监控", "parent": "2"},
|
||||
"493": {"name": "移动营销", "parent": "481"},
|
||||
"494": {"name": "微博营销", "parent": "481"},
|
||||
"495": {"name": "精准营销", "parent": "481"},
|
||||
"496": {"name": "海外营销", "parent": "481"},
|
||||
"497": {"name": "微信营销", "parent": "481"},
|
||||
"498": {"name": "智能手机", "parent": "487"},
|
||||
"499": {"name": "可穿戴设备", "parent": "487"},
|
||||
"500": {"name": "智能电视", "parent": "487"},
|
||||
"501": {"name": "WAP", "parent": "488"},
|
||||
"502": {"name": "物联网", "parent": "489"},
|
||||
"503": {"name": "O2O", "parent": "489"},
|
||||
"504": {"name": "数字出版", "parent": "489"},
|
||||
"505": {"name": "搜索", "parent": "2"},
|
||||
"506": {"name": "垂直搜索", "parent": "505"},
|
||||
"507": {"name": "无线搜索", "parent": "505"},
|
||||
"508": {"name": "网页搜索", "parent": "505"},
|
||||
"509": {"name": "网址导航", "parent": "2"},
|
||||
"510": {"name": "门户", "parent": "2"},
|
||||
"511": {"name": "网络文学", "parent": "2"},
|
||||
"512": {"name": "自媒体", "parent": "2"},
|
||||
"513": {"name": "金融", "parent": "0"},
|
||||
"514": {"name": "建筑与房地产", "parent": "0"},
|
||||
"515": {"name": "专业服务", "parent": "0"},
|
||||
"516": {"name": "教育培训", "parent": "0"},
|
||||
"517": {"name": "文化传媒", "parent": "0"},
|
||||
"518": {"name": "消费品", "parent": "0"},
|
||||
"519": {"name": "工业", "parent": "0"},
|
||||
"520": {"name": "交通物流", "parent": "0"},
|
||||
"521": {"name": "贸易", "parent": "0"},
|
||||
"522": {"name": "医药", "parent": "0"},
|
||||
"523": {"name": "医疗器械", "parent": "522"},
|
||||
"524": {"name": "保健品", "parent": "518"},
|
||||
"525": {"name": "服务业", "parent": "0"},
|
||||
"526": {"name": "能源/矿产/环保", "parent": "0"},
|
||||
"527": {"name": "化工", "parent": "0"},
|
||||
"528": {"name": "政府", "parent": "0"},
|
||||
"529": {"name": "公共事业", "parent": "0"},
|
||||
"530": {"name": "非盈利机构", "parent": "0"},
|
||||
"531": {"name": "农业", "parent": "1131"},
|
||||
"532": {"name": "林业", "parent": "1131"},
|
||||
"533": {"name": "畜牧业", "parent": "1131"},
|
||||
"534": {"name": "渔业", "parent": "1131"},
|
||||
"535": {"name": "学术科研", "parent": "0"},
|
||||
"536": {"name": "零售", "parent": "0"},
|
||||
"537": {"name": "银行", "parent": "513"},
|
||||
"538": {"name": "保险", "parent": "513"},
|
||||
"539": {"name": "证券", "parent": "513"},
|
||||
"540": {"name": "基金", "parent": "513"},
|
||||
"541": {"name": "信托", "parent": "513"},
|
||||
"542": {"name": "担保", "parent": "513"},
|
||||
"543": {"name": "典当", "parent": "513"},
|
||||
"544": {"name": "拍卖", "parent": "513"},
|
||||
"545": {"name": "投资/融资", "parent": "513"},
|
||||
"546": {"name": "期货", "parent": "513"},
|
||||
"547": {"name": "房地产开发", "parent": "514"},
|
||||
"548": {"name": "工程施工", "parent": "514"},
|
||||
"549": {"name": "建筑设计", "parent": "514"},
|
||||
"550": {"name": "房地产代理", "parent": "514"},
|
||||
"551": {"name": "物业管理", "parent": "514"},
|
||||
"552": {"name": "室内设计", "parent": "514"},
|
||||
"553": {"name": "装修装潢", "parent": "514"},
|
||||
"554": {"name": "市政工程", "parent": "514"},
|
||||
"555": {"name": "工程造价", "parent": "514"},
|
||||
"556": {"name": "工程监理", "parent": "514"},
|
||||
"557": {"name": "环境工程", "parent": "514"},
|
||||
"558": {"name": "园林景观", "parent": "514"},
|
||||
"559": {"name": "法律", "parent": "515"},
|
||||
"560": {"name": "人力资源", "parent": "515"},
|
||||
"561": {"name": "会计", "parent": "1125"},
|
||||
"562": {"name": "审计", "parent": "515"},
|
||||
"563": {"name": "检测认证", "parent": "515"},
|
||||
"565": {"name": "翻译", "parent": "515"},
|
||||
"566": {"name": "中介", "parent": "515"},
|
||||
"567": {"name": "咨询", "parent": "515"},
|
||||
"568": {"name": "外包服务", "parent": "515"},
|
||||
"569": {"name": "家教", "parent": "516"},
|
||||
"570": {"name": "早教", "parent": "516"},
|
||||
"571": {"name": "职业技能培训", "parent": "516"},
|
||||
"572": {"name": "外语培训", "parent": "516"},
|
||||
"573": {"name": "设计培训", "parent": "516"},
|
||||
"574": {"name": "IT培训", "parent": "516"},
|
||||
"575": {"name": "文艺体育培训", "parent": "516"},
|
||||
"576": {"name": "学历教育", "parent": "516"},
|
||||
"577": {"name": "管理培训", "parent": "516"},
|
||||
"578": {"name": "民办基础教育", "parent": "516"},
|
||||
"579": {"name": "广告", "parent": "517"},
|
||||
"580": {"name": "媒体", "parent": "517"},
|
||||
"581": {"name": "会展", "parent": "517"},
|
||||
"582": {"name": "公关", "parent": "517"},
|
||||
"583": {"name": "影视", "parent": "517"},
|
||||
"584": {"name": "艺术", "parent": "517"},
|
||||
"585": {"name": "文化传播", "parent": "517"},
|
||||
"586": {"name": "娱乐", "parent": "517"},
|
||||
"587": {"name": "体育", "parent": "517"},
|
||||
"588": {"name": "出版", "parent": "517"},
|
||||
"589": {"name": "休闲", "parent": "517"},
|
||||
"590": {"name": "动漫", "parent": "517"},
|
||||
"591": {"name": "市场推广", "parent": "517"},
|
||||
"592": {"name": "市场研究", "parent": "517"},
|
||||
"593": {"name": "食品", "parent": "1129"},
|
||||
"594": {"name": "饮料", "parent": "1129"},
|
||||
"595": {"name": "烟草", "parent": "1129"},
|
||||
"596": {"name": "酒品", "parent": "518"},
|
||||
"597": {"name": "服饰", "parent": "518"},
|
||||
"598": {"name": "纺织", "parent": "518"},
|
||||
"599": {"name": "化妆品", "parent": "1129"},
|
||||
"600": {"name": "日用品", "parent": "1129"},
|
||||
"601": {"name": "家电", "parent": "518"},
|
||||
"602": {"name": "家具", "parent": "518"},
|
||||
"603": {"name": "办公用品", "parent": "518"},
|
||||
"604": {"name": "奢侈品", "parent": "518"},
|
||||
"605": {"name": "珠宝", "parent": "518"},
|
||||
"606": {"name": "数码产品", "parent": "518"},
|
||||
"607": {"name": "玩具", "parent": "518"},
|
||||
"608": {"name": "图书", "parent": "518"},
|
||||
"609": {"name": "音像", "parent": "518"},
|
||||
"610": {"name": "钟表", "parent": "518"},
|
||||
"611": {"name": "箱包", "parent": "518"},
|
||||
"612": {"name": "母婴", "parent": "518"},
|
||||
"613": {"name": "营养保健", "parent": "518"},
|
||||
"614": {"name": "户外用品", "parent": "518"},
|
||||
"615": {"name": "健身器材", "parent": "518"},
|
||||
"616": {"name": "乐器", "parent": "518"},
|
||||
"617": {"name": "汽车用品", "parent": "518"},
|
||||
"619": {"name": "厨具", "parent": "518"},
|
||||
"620": {"name": "机械制造", "parent": "519"},
|
||||
"621": {"name": "流体控制", "parent": "519"},
|
||||
"622": {"name": "自动化控制", "parent": "519"},
|
||||
"623": {"name": "仪器仪表", "parent": "519"},
|
||||
"624": {"name": "航空/航天", "parent": "519"},
|
||||
"625": {"name": "交通设施", "parent": "519"},
|
||||
"626": {"name": "工业电子", "parent": "519"},
|
||||
"627": {"name": "建材", "parent": "519"},
|
||||
"628": {"name": "五金材料", "parent": "519"},
|
||||
"629": {"name": "汽车", "parent": "519"},
|
||||
"630": {"name": "印刷", "parent": "519"},
|
||||
"631": {"name": "造纸", "parent": "519"},
|
||||
"632": {"name": "包装", "parent": "519"},
|
||||
"633": {"name": "原材料及加工", "parent": "519"},
|
||||
"634": {"name": "物流", "parent": "520"},
|
||||
"635": {"name": "仓储", "parent": "520"},
|
||||
"636": {"name": "客运", "parent": "520"},
|
||||
"637": {"name": "快递", "parent": "520"},
|
||||
"638": {"name": "化学药", "parent": "522"},
|
||||
"639": {"name": "中药", "parent": "522"},
|
||||
"640": {"name": "生物制药", "parent": "522"},
|
||||
"641": {"name": "兽药", "parent": "522"},
|
||||
"642": {"name": "农药", "parent": "522"},
|
||||
"643": {"name": "CRO", "parent": "522"},
|
||||
"644": {"name": "消毒", "parent": "522"},
|
||||
"645": {"name": "医药商业", "parent": "522"},
|
||||
"646": {"name": "医疗服务", "parent": "522"},
|
||||
"647": {"name": "医疗器械", "parent": "523"},
|
||||
"648": {"name": "制药设备", "parent": "523"},
|
||||
"649": {"name": "医用耗材", "parent": "523"},
|
||||
"650": {"name": "手术器械", "parent": "523"},
|
||||
"651": {"name": "保健器材", "parent": "524"},
|
||||
"652": {"name": "性保健品", "parent": "524"},
|
||||
"653": {"name": "医药保养", "parent": "524"},
|
||||
"654": {"name": "医用保健", "parent": "524"},
|
||||
"655": {"name": "酒店", "parent": "525"},
|
||||
"656": {"name": "餐饮", "parent": "525"},
|
||||
"657": {"name": "旅游", "parent": "525"},
|
||||
"658": {"name": "生活服务", "parent": "525"},
|
||||
"659": {"name": "保健服务", "parent": "525"},
|
||||
"660": {"name": "运动健身", "parent": "525"},
|
||||
"661": {"name": "家政服务", "parent": "525"},
|
||||
"662": {"name": "婚庆服务", "parent": "525"},
|
||||
"663": {"name": "租赁服务", "parent": "525"},
|
||||
"664": {"name": "维修服务", "parent": "525"},
|
||||
"665": {"name": "石油天然气", "parent": "526"},
|
||||
"666": {"name": "电力", "parent": "526"},
|
||||
"667": {"name": "新能源", "parent": "526"},
|
||||
"668": {"name": "水利", "parent": "526"},
|
||||
"669": {"name": "矿产", "parent": "526"},
|
||||
"670": {"name": "采掘业", "parent": "526"},
|
||||
"671": {"name": "冶炼", "parent": "526"},
|
||||
"672": {"name": "环保", "parent": "526"},
|
||||
"673": {"name": "无机化工原料", "parent": "527"},
|
||||
"674": {"name": "有机化工原料", "parent": "527"},
|
||||
"675": {"name": "精细化学品", "parent": "527"},
|
||||
"676": {"name": "化工设备", "parent": "527"},
|
||||
"677": {"name": "化工工程", "parent": "527"},
|
||||
"678": {"name": "资产管理", "parent": "513"},
|
||||
"679": {"name": "金融租赁", "parent": "513"},
|
||||
"680": {"name": "征信及信评机构", "parent": "513"},
|
||||
"681": {"name": "资产评估机构", "parent": "513"},
|
||||
"683": {"name": "金融监管机构", "parent": "513"},
|
||||
"684": {"name": "国际贸易", "parent": "521"},
|
||||
"685": {"name": "海关", "parent": "521"},
|
||||
"686": {"name": "购物中心", "parent": "536"},
|
||||
"687": {"name": "超市", "parent": "536"},
|
||||
"688": {"name": "便利店", "parent": "536"},
|
||||
"689": {"name": "专卖店", "parent": "536"},
|
||||
"690": {"name": "专业店", "parent": "536"},
|
||||
"691": {"name": "百货店", "parent": "536"},
|
||||
"692": {"name": "杂货店", "parent": "536"},
|
||||
"693": {"name": "个人银行", "parent": "537"},
|
||||
"695": {"name": "私人银行", "parent": "537"},
|
||||
"696": {"name": "公司银行", "parent": "537"},
|
||||
"697": {"name": "投资银行", "parent": "537"},
|
||||
"698": {"name": "政策性银行", "parent": "537"},
|
||||
"699": {"name": "中央银行", "parent": "537"},
|
||||
"700": {"name": "人寿险", "parent": "538"},
|
||||
"701": {"name": "财产险", "parent": "538"},
|
||||
"702": {"name": "再保险", "parent": "538"},
|
||||
"703": {"name": "养老险", "parent": "538"},
|
||||
"704": {"name": "保险代理公司", "parent": "538"},
|
||||
"705": {"name": "公募基金", "parent": "540"},
|
||||
"707": {"name": "私募基金", "parent": "540"},
|
||||
"708": {"name": "第三方理财", "parent": "679"},
|
||||
"709": {"name": "资产管理公司", "parent": "679"},
|
||||
"711": {"name": "房产中介", "parent": "566"},
|
||||
"712": {"name": "职业中介", "parent": "566"},
|
||||
"713": {"name": "婚姻中介", "parent": "566"},
|
||||
"714": {"name": "战略咨询", "parent": "567"},
|
||||
"715": {"name": "投资咨询", "parent": "567"},
|
||||
"716": {"name": "心理咨询", "parent": "567"},
|
||||
"717": {"name": "留学移民咨询", "parent": "567"},
|
||||
"718": {"name": "工商注册代理", "parent": "568"},
|
||||
"719": {"name": "商标专利代理", "parent": "568"},
|
||||
"720": {"name": "财务代理", "parent": "568"},
|
||||
"721": {"name": "工程机械", "parent": "620"},
|
||||
"722": {"name": "农业机械", "parent": "620"},
|
||||
"723": {"name": "海工设备", "parent": "620"},
|
||||
"724": {"name": "包装机械", "parent": "620"},
|
||||
"725": {"name": "印刷机械", "parent": "620"},
|
||||
"726": {"name": "数控机床", "parent": "620"},
|
||||
"727": {"name": "矿山机械", "parent": "620"},
|
||||
"728": {"name": "水泵", "parent": "621"},
|
||||
"729": {"name": "管道", "parent": "621"},
|
||||
"730": {"name": "阀门", "parent": "621"},
|
||||
"732": {"name": "压缩机", "parent": "621"},
|
||||
"733": {"name": "集散控制系统", "parent": "622"},
|
||||
"734": {"name": "远程控制", "parent": "622"},
|
||||
"735": {"name": "液压系统", "parent": "622"},
|
||||
"736": {"name": "楼宇智能化", "parent": "622"},
|
||||
"737": {"name": "飞机制造", "parent": "624"},
|
||||
"738": {"name": "航空公司", "parent": "624"},
|
||||
"739": {"name": "发动机", "parent": "624"},
|
||||
"740": {"name": "复合材料", "parent": "624"},
|
||||
"741": {"name": "高铁", "parent": "625"},
|
||||
"742": {"name": "地铁", "parent": "625"},
|
||||
"743": {"name": "信号传输", "parent": "625"},
|
||||
"745": {"name": "结构材料", "parent": "627"},
|
||||
"746": {"name": "装饰材料", "parent": "627"},
|
||||
"747": {"name": "专用材料", "parent": "627"},
|
||||
"749": {"name": "经销商集团", "parent": "629"},
|
||||
"750": {"name": "整车制造", "parent": "629"},
|
||||
"751": {"name": "汽车零配件", "parent": "629"},
|
||||
"752": {"name": "外型设计", "parent": "629"},
|
||||
"753": {"name": "平版印刷", "parent": "630"},
|
||||
"754": {"name": "凸版印刷", "parent": "630"},
|
||||
"755": {"name": "凹版印刷", "parent": "630"},
|
||||
"756": {"name": "孔版印刷", "parent": "630"},
|
||||
"757": {"name": "印刷用纸", "parent": "631"},
|
||||
"758": {"name": "书写、制图及复制用纸", "parent": "631"},
|
||||
"759": {"name": "包装用纸", "parent": "631"},
|
||||
"760": {"name": "生活、卫生及装饰用纸", "parent": "631"},
|
||||
"761": {"name": "技术用纸", "parent": "631"},
|
||||
"762": {"name": "加工纸原纸", "parent": "631"},
|
||||
"763": {"name": "食品包装", "parent": "632"},
|
||||
"764": {"name": "医药包装", "parent": "632"},
|
||||
"765": {"name": "日化包装", "parent": "632"},
|
||||
"766": {"name": "物流包装", "parent": "632"},
|
||||
"767": {"name": "礼品包装", "parent": "632"},
|
||||
"768": {"name": "电子五金包装", "parent": "632"},
|
||||
"769": {"name": "汽车服务", "parent": "525"},
|
||||
"770": {"name": "汽车保养", "parent": "769"},
|
||||
"771": {"name": "租车", "parent": "769"},
|
||||
"773": {"name": "出租车", "parent": "769"},
|
||||
"774": {"name": "代驾", "parent": "769"},
|
||||
"775": {"name": "发电", "parent": "666"},
|
||||
"777": {"name": "输配电", "parent": "666"},
|
||||
"779": {"name": "风电", "parent": "667"},
|
||||
"780": {"name": "光伏/太阳能", "parent": "667"},
|
||||
"781": {"name": "生物质发电", "parent": "667"},
|
||||
"782": {"name": "煤化工", "parent": "667"},
|
||||
"783": {"name": "垃圾发电", "parent": "667"},
|
||||
"784": {"name": "核电", "parent": "667"},
|
||||
"785": {"name": "能源矿产", "parent": "669"},
|
||||
"786": {"name": "金属矿产", "parent": "669"},
|
||||
"787": {"name": "非金属矿产", "parent": "669"},
|
||||
"788": {"name": "水气矿产", "parent": "669"},
|
||||
"789": {"name": "锅炉", "parent": "775"},
|
||||
"790": {"name": "发电机", "parent": "775"},
|
||||
"791": {"name": "汽轮机", "parent": "775"},
|
||||
"792": {"name": "燃机", "parent": "775"},
|
||||
"793": {"name": "冷却", "parent": "775"},
|
||||
"794": {"name": "电力设计院", "parent": "775"},
|
||||
"795": {"name": "高压输配电", "parent": "777"},
|
||||
"796": {"name": "中压输配电", "parent": "777"},
|
||||
"797": {"name": "低压输配电", "parent": "777"},
|
||||
"798": {"name": "继电保护", "parent": "777"},
|
||||
"799": {"name": "智能电网", "parent": "777"},
|
||||
"800": {"name": "小学", "parent": "516"},
|
||||
"801": {"name": "电动车", "parent": "519"},
|
||||
"802": {"name": "皮具箱包", "parent": "518"},
|
||||
"803": {"name": "医药制造", "parent": "522"},
|
||||
"804": {"name": "电器销售", "parent": "536"},
|
||||
"805": {"name": "塑料制品", "parent": "527"},
|
||||
"806": {"name": "公益基金会", "parent": "530"},
|
||||
"807": {"name": "美发服务", "parent": "525"},
|
||||
"808": {"name": "农业养殖", "parent": "531"},
|
||||
"809": {"name": "金融服务", "parent": "513"},
|
||||
"810": {"name": "商业地产综合体", "parent": "514"},
|
||||
"811": {"name": "美容服务", "parent": "525"},
|
||||
"812": {"name": "灯饰", "parent": "518"},
|
||||
"813": {"name": "油墨颜料产品", "parent": "527"},
|
||||
"814": {"name": "眼镜制造", "parent": "518"},
|
||||
"815": {"name": "农业生物技术", "parent": "531"},
|
||||
"816": {"name": "体育用品", "parent": "518"},
|
||||
"817": {"name": "保健用品", "parent": "524"},
|
||||
"818": {"name": "化学化工产品", "parent": "527"},
|
||||
"819": {"name": "饲料", "parent": "531"},
|
||||
"821": {"name": "保安服务", "parent": "525"},
|
||||
"822": {"name": "干细胞技术", "parent": "522"},
|
||||
"824": {"name": "农药化肥", "parent": "527"},
|
||||
"825": {"name": "卫生洁具", "parent": "518"},
|
||||
"826": {"name": "体育器材、场馆", "parent": "518"},
|
||||
"827": {"name": "饲料加工", "parent": "531"},
|
||||
"828": {"name": "测绘服务", "parent": "529"},
|
||||
"830": {"name": "金属船舶制造", "parent": "519"},
|
||||
"831": {"name": "基因工程", "parent": "522"},
|
||||
"832": {"name": "花卉服务", "parent": "536"},
|
||||
"833": {"name": "农业种植", "parent": "531"},
|
||||
"834": {"name": "皮革制品", "parent": "518"},
|
||||
"835": {"name": "地理信息加工服务", "parent": "529"},
|
||||
"836": {"name": "机器人", "parent": "519"},
|
||||
"837": {"name": "礼品", "parent": "518"},
|
||||
"838": {"name": "理发及美容服务", "parent": "525"},
|
||||
"839": {"name": "其他清洁服务", "parent": "525"},
|
||||
"840": {"name": "硅胶材料", "parent": "527"},
|
||||
"841": {"name": "茶叶销售", "parent": "518"},
|
||||
"842": {"name": "彩票活动", "parent": "529"},
|
||||
"843": {"name": "化妆培训", "parent": "516"},
|
||||
"844": {"name": "鞋业", "parent": "518"},
|
||||
"845": {"name": "酒店用品", "parent": "518"},
|
||||
"846": {"name": "复合材料", "parent": "527"},
|
||||
"847": {"name": "房地产工程建设", "parent": "548"},
|
||||
"848": {"name": "知识产权服务", "parent": "559"},
|
||||
"849": {"name": "新型建材", "parent": "627"},
|
||||
"850": {"name": "企业投资咨询", "parent": "567"},
|
||||
"851": {"name": "含乳饮料和植物蛋白饮料制造", "parent": "594"},
|
||||
"852": {"name": "汽车检测设备", "parent": "629"},
|
||||
"853": {"name": "手机通讯器材", "parent": "417"},
|
||||
"854": {"name": "环保材料", "parent": "672"},
|
||||
"855": {"name": "交通设施", "parent": "554"},
|
||||
"856": {"name": "电子器件", "parent": "419"},
|
||||
"857": {"name": "啤酒", "parent": "594"},
|
||||
"858": {"name": "生态旅游", "parent": "657"},
|
||||
"859": {"name": "自动化设备", "parent": "626"},
|
||||
"860": {"name": "软件开发", "parent": "414"},
|
||||
"861": {"name": "葡萄酒销售", "parent": "594"},
|
||||
"862": {"name": "钢材", "parent": "633"},
|
||||
"863": {"name": "餐饮培训", "parent": "656"},
|
||||
"864": {"name": "速冻食品", "parent": "593"},
|
||||
"865": {"name": "空气环保", "parent": "672"},
|
||||
"866": {"name": "互联网房地产经纪服务", "parent": "550"},
|
||||
"867": {"name": "食品添加剂", "parent": "593"},
|
||||
"868": {"name": "演艺传播", "parent": "585"},
|
||||
"869": {"name": "信用卡", "parent": "537"},
|
||||
"870": {"name": "报纸期刊广告", "parent": "579"},
|
||||
"871": {"name": "摄影", "parent": "525"},
|
||||
"872": {"name": "手机软件", "parent": "414"},
|
||||
"873": {"name": "地坪建材", "parent": "627"},
|
||||
"874": {"name": "企业管理咨询", "parent": "567"},
|
||||
"875": {"name": "幼儿教育", "parent": "570"},
|
||||
"876": {"name": "系统集成", "parent": "416"},
|
||||
"877": {"name": "皮革服饰", "parent": "597"},
|
||||
"878": {"name": "保健食品", "parent": "593"},
|
||||
"879": {"name": "叉车", "parent": "620"},
|
||||
"880": {"name": "厨卫电器", "parent": "601"},
|
||||
"882": {"name": "地暖设备", "parent": "627"},
|
||||
"883": {"name": "钢结构制造", "parent": "548"},
|
||||
"884": {"name": "投影机", "parent": "606"},
|
||||
"885": {"name": "啤酒销售", "parent": "594"},
|
||||
"886": {"name": "度假村旅游", "parent": "657"},
|
||||
"887": {"name": "电力元件设备", "parent": "626"},
|
||||
"888": {"name": "管理软件", "parent": "414"},
|
||||
"889": {"name": "轴承", "parent": "628"},
|
||||
"890": {"name": "餐饮设备", "parent": "656"},
|
||||
"891": {"name": "肉制品及副产品加工", "parent": "593"},
|
||||
"892": {"name": "艺术收藏品投资交易", "parent": "584"},
|
||||
"893": {"name": "净水器", "parent": "601"},
|
||||
"894": {"name": "进口食品", "parent": "593"},
|
||||
"895": {"name": "娱乐文化传播", "parent": "585"},
|
||||
"896": {"name": "文化传播", "parent": "585"},
|
||||
"897": {"name": "商旅传媒", "parent": "580"},
|
||||
"898": {"name": "广告设计制作", "parent": "579"},
|
||||
"899": {"name": "金属丝绳及其制品制造", "parent": "627"},
|
||||
"900": {"name": "建筑涂料", "parent": "627"},
|
||||
"901": {"name": "抵押贷款", "parent": "543"},
|
||||
"902": {"name": "早教", "parent": "570"},
|
||||
"903": {"name": "电影放映", "parent": "583"},
|
||||
"904": {"name": "内衣服饰", "parent": "597"},
|
||||
"905": {"name": "无线网络通信", "parent": "418"},
|
||||
"906": {"name": "记忆卡", "parent": "415"},
|
||||
"907": {"name": "女装服饰", "parent": "597"},
|
||||
"908": {"name": "建筑机械", "parent": "620"},
|
||||
"909": {"name": "制冷电器", "parent": "601"},
|
||||
"910": {"name": "通信设备", "parent": "417"},
|
||||
"911": {"name": "空调设备", "parent": "601"},
|
||||
"912": {"name": "建筑装饰", "parent": "553"},
|
||||
"913": {"name": "办公设备", "parent": "603"},
|
||||
"916": {"name": "数据处理软件", "parent": "414"},
|
||||
"917": {"name": "葡萄酒贸易", "parent": "594"},
|
||||
"918": {"name": "通讯器材", "parent": "417"},
|
||||
"919": {"name": "铜业", "parent": "633"},
|
||||
"920": {"name": "食堂", "parent": "656"},
|
||||
"921": {"name": "糖果零食", "parent": "593"},
|
||||
"922": {"name": "文化艺术传播", "parent": "584"},
|
||||
"923": {"name": "太阳能电器", "parent": "601"},
|
||||
"924": {"name": "药品零售", "parent": "645"},
|
||||
"925": {"name": "果蔬食品", "parent": "593"},
|
||||
"926": {"name": "文化活动策划", "parent": "585"},
|
||||
"928": {"name": "汽车广告", "parent": "657"},
|
||||
"929": {"name": "条码设备", "parent": "630"},
|
||||
"930": {"name": "建筑石材", "parent": "627"},
|
||||
"931": {"name": "贵金属", "parent": "545"},
|
||||
"932": {"name": "体育", "parent": "660"},
|
||||
"933": {"name": "金融信息服务", "parent": "414"},
|
||||
"934": {"name": "玻璃建材", "parent": "627"},
|
||||
"935": {"name": "家教", "parent": "569"},
|
||||
"936": {"name": "歌舞厅娱乐活动", "parent": "586"},
|
||||
"937": {"name": "计算机服务器", "parent": "415"},
|
||||
"938": {"name": "管道", "parent": "627"},
|
||||
"939": {"name": "婴幼儿服饰", "parent": "597"},
|
||||
"940": {"name": "热水器", "parent": "601"},
|
||||
"941": {"name": "计算机及零部件制造", "parent": "415"},
|
||||
"942": {"name": "钢铁贸易", "parent": "633"},
|
||||
"944": {"name": "包装材料", "parent": "632"},
|
||||
"945": {"name": "计算机办公设备", "parent": "603"},
|
||||
"946": {"name": "白酒", "parent": "594"},
|
||||
"948": {"name": "发动机", "parent": "620"},
|
||||
"949": {"name": "快餐服务", "parent": "656"},
|
||||
"950": {"name": "酒类销售", "parent": "594"},
|
||||
"951": {"name": "电子产品、机电设备", "parent": "626"},
|
||||
"952": {"name": "激光设备", "parent": "626"},
|
||||
"953": {"name": "餐饮策划", "parent": "656"},
|
||||
"954": {"name": "饮料、食品", "parent": "594"},
|
||||
"955": {"name": "文化娱乐经纪", "parent": "585"},
|
||||
"956": {"name": "天然气", "parent": "665"},
|
||||
"957": {"name": "农副食品", "parent": "593"},
|
||||
"958": {"name": "艺术表演", "parent": "585"},
|
||||
"959": {"name": "石膏、水泥制品及类似制品制造", "parent": "627"},
|
||||
"960": {"name": "橱柜", "parent": "602"},
|
||||
"961": {"name": "管理培训", "parent": "577"},
|
||||
"962": {"name": "男装服饰", "parent": "597"},
|
||||
"963": {"name": "化肥制造", "parent": "675"},
|
||||
"964": {"name": "童装服饰", "parent": "597"},
|
||||
"965": {"name": "电源电池", "parent": "626"},
|
||||
"966": {"name": "家电维修", "parent": "664"},
|
||||
"967": {"name": "光电子器件", "parent": "419"},
|
||||
"968": {"name": "旅行社服务", "parent": "657"},
|
||||
"969": {"name": "电线、电缆制造", "parent": "626"},
|
||||
"970": {"name": "软件开发、信息系统集成", "parent": "419"},
|
||||
"971": {"name": "白酒制造", "parent": "594"},
|
||||
"973": {"name": "甜品服务", "parent": "656"},
|
||||
"974": {"name": "糕点、面包制造", "parent": "593"},
|
||||
"975": {"name": "木工机械", "parent": "620"},
|
||||
"976": {"name": "酒吧服务", "parent": "656"},
|
||||
"977": {"name": "火腿肠", "parent": "593"},
|
||||
"978": {"name": "广告策划推广", "parent": "579"},
|
||||
"979": {"name": "新能源产品和生产装备制造", "parent": "667"},
|
||||
"980": {"name": "调味品", "parent": "593"},
|
||||
"981": {"name": "礼仪表演", "parent": "585"},
|
||||
"982": {"name": "劳务派遣", "parent": "560"},
|
||||
"983": {"name": "建材零售", "parent": "627"},
|
||||
"984": {"name": "商品交易中心", "parent": "545"},
|
||||
"985": {"name": "体育推广", "parent": "585"},
|
||||
"986": {"name": "茶饮料及其他饮料制造", "parent": "594"},
|
||||
"987": {"name": "金属建材", "parent": "627"},
|
||||
"988": {"name": "职业技能培训", "parent": "571"},
|
||||
"989": {"name": "网吧活动", "parent": "586"},
|
||||
"990": {"name": "洗衣服务", "parent": "658"},
|
||||
"991": {"name": "管道工程", "parent": "554"},
|
||||
"992": {"name": "通信工程", "parent": "417"},
|
||||
"993": {"name": "电子元器件", "parent": "626"},
|
||||
"994": {"name": "电子设备", "parent": "419"},
|
||||
"995": {"name": "茶馆服务", "parent": "656"},
|
||||
"996": {"name": "旅游开发", "parent": "657"},
|
||||
"997": {"name": "视频通讯", "parent": "417"},
|
||||
"998": {"name": "白酒销售", "parent": "594"},
|
||||
"1000": {"name": "咖啡馆服务", "parent": "656"},
|
||||
"1001": {"name": "食品零售", "parent": "593"},
|
||||
"1002": {"name": "健康疗养旅游", "parent": "655"},
|
||||
"1003": {"name": "粮油食品", "parent": "593"},
|
||||
"1004": {"name": "儿童教育影视", "parent": "583"},
|
||||
"1005": {"name": "新能源发电", "parent": "667"},
|
||||
"1006": {"name": "旅游策划", "parent": "657"},
|
||||
"1007": {"name": "绘画", "parent": "575"},
|
||||
"1008": {"name": "方便面及其他方便食品", "parent": "593"},
|
||||
"1009": {"name": "房地产经纪", "parent": "550"},
|
||||
"1010": {"name": "母婴家政", "parent": "661"},
|
||||
"1011": {"name": "居家养老健康服务", "parent": "661"},
|
||||
"1012": {"name": "文化艺术投资", "parent": "545"},
|
||||
"1013": {"name": "运动健身", "parent": "660"},
|
||||
"1014": {"name": "瓶(罐)装饮用水制造", "parent": "594"},
|
||||
"1015": {"name": "金属门窗", "parent": "627"},
|
||||
"1016": {"name": "机动车检测", "parent": "563"},
|
||||
"1017": {"name": "货物运输", "parent": "634"},
|
||||
"1018": {"name": "服饰专卖", "parent": "690"},
|
||||
"1019": {"name": "酒店服装", "parent": "597"},
|
||||
"1020": {"name": "通讯软件", "parent": "417"},
|
||||
"1021": {"name": "消防工程", "parent": "554"},
|
||||
"1022": {"name": "嵌入式电子系统", "parent": "419"},
|
||||
"1023": {"name": "航空票务", "parent": "636"},
|
||||
"1024": {"name": "电气设备", "parent": "626"},
|
||||
"1025": {"name": "酒业贸易", "parent": "594"},
|
||||
"1027": {"name": "其他饮料及冷饮服务", "parent": "656"},
|
||||
"1028": {"name": "乳制品", "parent": "593"},
|
||||
"1029": {"name": "新闻期刊出版", "parent": "588"},
|
||||
"1030": {"name": "水污染治理", "parent": "672"},
|
||||
"1031": {"name": "谷物食品", "parent": "593"},
|
||||
"1032": {"name": "数字动漫设计制造服务", "parent": "590"},
|
||||
"1033": {"name": "医院", "parent": "646"},
|
||||
"1034": {"name": "旅游广告", "parent": "657"},
|
||||
"1035": {"name": "办公家具", "parent": "602"},
|
||||
"1036": {"name": "房地产营销策划", "parent": "550"},
|
||||
"1037": {"name": "保洁家政", "parent": "661"},
|
||||
"1038": {"name": "水泥制造", "parent": "627"},
|
||||
"1039": {"name": "市场研究咨询", "parent": "567"},
|
||||
"1040": {"name": "驾校", "parent": "571"},
|
||||
"1041": {"name": "正餐服务", "parent": "656"},
|
||||
"1043": {"name": "机动车燃油", "parent": "665"},
|
||||
"1044": {"name": "食品", "parent": "593"},
|
||||
"1045": {"name": "新能源汽车", "parent": "629"},
|
||||
"1046": {"name": "手机无线网络推广", "parent": "417"},
|
||||
"1047": {"name": "环保设备", "parent": "672"},
|
||||
"1048": {"name": "通讯工程", "parent": "418"},
|
||||
"1049": {"name": "半导体集成电路", "parent": "419"},
|
||||
"1050": {"name": "航空服务", "parent": "636"},
|
||||
"1051": {"name": "电机设备", "parent": "626"},
|
||||
"1052": {"name": "档案软件", "parent": "414"},
|
||||
"1053": {"name": "冷链物流服务", "parent": "634"},
|
||||
"1054": {"name": "小吃服务", "parent": "656"},
|
||||
"1055": {"name": "水产品加工", "parent": "593"},
|
||||
"1056": {"name": "图书出版", "parent": "588"},
|
||||
"1057": {"name": "固体废物治理", "parent": "672"},
|
||||
"1059": {"name": "坚果食品", "parent": "593"},
|
||||
"1060": {"name": "广告传媒", "parent": "579"},
|
||||
"1061": {"name": "电梯", "parent": "622"},
|
||||
"1062": {"name": "社区医疗与卫生院", "parent": "646"},
|
||||
"1063": {"name": "广告、印刷包装", "parent": "630"},
|
||||
"1064": {"name": "婚纱礼服", "parent": "662"},
|
||||
"1065": {"name": "地毯", "parent": "602"},
|
||||
"1066": {"name": "互联网物业", "parent": "551"},
|
||||
"1067": {"name": "跨境电商", "parent": "3"},
|
||||
"1068": {"name": "信息安全、系统集成", "parent": "9"},
|
||||
"1069": {"name": "专用汽车制造", "parent": "750"},
|
||||
"1070": {"name": "商品贸易", "parent": "3"},
|
||||
"1071": {"name": "墙壁装饰材料", "parent": "746"},
|
||||
"1072": {"name": "窗帘装饰材料", "parent": "746"},
|
||||
"1073": {"name": "电子商务、本地生活服务", "parent": "3"},
|
||||
"1075": {"name": "白酒电子商务", "parent": "3"},
|
||||
"1076": {"name": "商品贸易、电子商务", "parent": "3"},
|
||||
"1077": {"name": "木质装饰材料", "parent": "746"},
|
||||
"1078": {"name": "电子商务、汽车电商交易平台", "parent": "3"},
|
||||
"1079": {"name": "汽车轮胎", "parent": "751"},
|
||||
"1080": {"name": "气体压缩机械制造", "parent": "732"},
|
||||
"1081": {"name": "家装家具电子商务", "parent": "3"},
|
||||
"1082": {"name": "化妆品电子商务", "parent": "3"},
|
||||
"1083": {"name": "汽车销售", "parent": "749"},
|
||||
"1084": {"name": "新闻资讯网站", "parent": "510"},
|
||||
"1085": {"name": "母婴电商", "parent": "3"},
|
||||
"1086": {"name": "电商商务、收藏品交易", "parent": "3"},
|
||||
"1088": {"name": "电子商务、数码产品", "parent": "3"},
|
||||
"1089": {"name": "二手车交易", "parent": "749"},
|
||||
"1090": {"name": "游戏制作服务", "parent": "5"},
|
||||
"1091": {"name": "母婴服务", "parent": "510"},
|
||||
"1092": {"name": "家具电子商务", "parent": "3"},
|
||||
"1093": {"name": "汽车配件电子商务", "parent": "3"},
|
||||
"1094": {"name": "输配电设备", "parent": "777"},
|
||||
"1095": {"name": "矿山设备", "parent": "727"},
|
||||
"1096": {"name": "机床机械", "parent": "726"},
|
||||
"1097": {"name": "农产品电商", "parent": "3"},
|
||||
"1098": {"name": "陶瓷装饰材料", "parent": "746"},
|
||||
"1099": {"name": "车载联网设备", "parent": "487"},
|
||||
"1100": {"name": "汽车销售电子商务", "parent": "3"},
|
||||
"1101": {"name": "石油设备", "parent": "730"},
|
||||
"1102": {"name": "智能家居", "parent": "487"},
|
||||
"1103": {"name": "散热器", "parent": "751"},
|
||||
"1104": {"name": "电力工程", "parent": "775"},
|
||||
"1105": {"name": "生鲜电商", "parent": "3"},
|
||||
"1106": {"name": "互联网数据服务", "parent": "490"},
|
||||
"1107": {"name": "房车、商务车销售", "parent": "749"},
|
||||
"1108": {"name": "茶叶电子商务", "parent": "3"},
|
||||
"1109": {"name": "酒类电子商务", "parent": "3"},
|
||||
"1110": {"name": "阀门", "parent": "730"},
|
||||
"1111": {"name": "食品电商", "parent": "3"},
|
||||
"1112": {"name": "儿童摄影", "parent": "871"},
|
||||
"1113": {"name": "广告摄影", "parent": "871"},
|
||||
"1114": {"name": "婚纱摄影", "parent": "871"},
|
||||
"1115": {"name": "模具制造", "parent": "620"},
|
||||
"1116": {"name": "汽车模具", "parent": "629"},
|
||||
"1117": {"name": "认证咨询", "parent": "567"},
|
||||
"1118": {"name": "数字视觉制作服务", "parent": "590"},
|
||||
"1119": {"name": "牙科及医疗器械", "parent": "646"},
|
||||
"1120": {"name": "猎头招聘", "parent": "560"},
|
||||
"1121": {"name": "家居", "parent": "518"},
|
||||
"1122": {"name": "收藏品", "parent": "518"},
|
||||
"1123": {"name": "首饰", "parent": "518"},
|
||||
"1124": {"name": "工艺品", "parent": "518"},
|
||||
"1125": {"name": "财务", "parent": "515"},
|
||||
"1126": {"name": "税务", "parent": "515"},
|
||||
"1127": {"name": "分类信息", "parent": "2"},
|
||||
"1128": {"name": "宠物", "parent": "0"},
|
||||
"1129": {"name": "快消品", "parent": "518"},
|
||||
"1130": {"name": "人工智能", "parent": "2"},
|
||||
"1131": {"name": "农/林/牧/渔", "parent": "0"},
|
||||
}
|
||||
|
||||
|
||||
def get_names(id):
    """Return the category name chain for *id*: the category's own name
    first, followed by each ancestor's name up to the root.

    Unknown ids (including root parents such as "0"/"1" that are absent
    from TBL) produce an empty list / terminate the walk.

    The original implementation recursed through ``parent`` links; this
    version walks iteratively and keeps a ``seen`` set so a malformed,
    cyclic parent chain in TBL cannot recurse or loop forever.
    """
    names = []
    key = str(id)
    seen = set()  # guard against accidental cycles in the parent links
    while key in TBL and key not in seen:
        seen.add(key)
        entry = TBL[key]
        names.append(entry["name"])
        key = str(entry["parent"])
    return names
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(get_names("1119"))
|
||||
789
deepdoc/parser/resume/entities/regions.py
Normal file
789
deepdoc/parser/resume/entities/regions.py
Normal file
@@ -0,0 +1,789 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
TBL = {
|
||||
"2": {"name": "北京", "parent": "1"},
|
||||
"3": {"name": "天津", "parent": "1"},
|
||||
"4": {"name": "河北", "parent": "1"},
|
||||
"5": {"name": "山西", "parent": "1"},
|
||||
"6": {"name": "内蒙古", "parent": "1"},
|
||||
"7": {"name": "辽宁", "parent": "1"},
|
||||
"8": {"name": "吉林", "parent": "1"},
|
||||
"9": {"name": "黑龙江", "parent": "1"},
|
||||
"10": {"name": "上海", "parent": "1"},
|
||||
"11": {"name": "江苏", "parent": "1"},
|
||||
"12": {"name": "浙江", "parent": "1"},
|
||||
"13": {"name": "安徽", "parent": "1"},
|
||||
"14": {"name": "福建", "parent": "1"},
|
||||
"15": {"name": "江西", "parent": "1"},
|
||||
"16": {"name": "山东", "parent": "1"},
|
||||
"17": {"name": "河南", "parent": "1"},
|
||||
"18": {"name": "湖北", "parent": "1"},
|
||||
"19": {"name": "湖南", "parent": "1"},
|
||||
"20": {"name": "广东", "parent": "1"},
|
||||
"21": {"name": "广西", "parent": "1"},
|
||||
"22": {"name": "海南", "parent": "1"},
|
||||
"23": {"name": "重庆", "parent": "1"},
|
||||
"24": {"name": "四川", "parent": "1"},
|
||||
"25": {"name": "贵州", "parent": "1"},
|
||||
"26": {"name": "云南", "parent": "1"},
|
||||
"27": {"name": "西藏", "parent": "1"},
|
||||
"28": {"name": "陕西", "parent": "1"},
|
||||
"29": {"name": "甘肃", "parent": "1"},
|
||||
"30": {"name": "青海", "parent": "1"},
|
||||
"31": {"name": "宁夏", "parent": "1"},
|
||||
"32": {"name": "新疆", "parent": "1"},
|
||||
"33": {"name": "北京市", "parent": "2"},
|
||||
"34": {"name": "天津市", "parent": "3"},
|
||||
"35": {"name": "石家庄市", "parent": "4"},
|
||||
"36": {"name": "唐山市", "parent": "4"},
|
||||
"37": {"name": "秦皇岛市", "parent": "4"},
|
||||
"38": {"name": "邯郸市", "parent": "4"},
|
||||
"39": {"name": "邢台市", "parent": "4"},
|
||||
"40": {"name": "保定市", "parent": "4"},
|
||||
"41": {"name": "张家口市", "parent": "4"},
|
||||
"42": {"name": "承德市", "parent": "4"},
|
||||
"43": {"name": "沧州市", "parent": "4"},
|
||||
"44": {"name": "廊坊市", "parent": "4"},
|
||||
"45": {"name": "衡水市", "parent": "4"},
|
||||
"46": {"name": "太原市", "parent": "5"},
|
||||
"47": {"name": "大同市", "parent": "5"},
|
||||
"48": {"name": "阳泉市", "parent": "5"},
|
||||
"49": {"name": "长治市", "parent": "5"},
|
||||
"50": {"name": "晋城市", "parent": "5"},
|
||||
"51": {"name": "朔州市", "parent": "5"},
|
||||
"52": {"name": "晋中市", "parent": "5"},
|
||||
"53": {"name": "运城市", "parent": "5"},
|
||||
"54": {"name": "忻州市", "parent": "5"},
|
||||
"55": {"name": "临汾市", "parent": "5"},
|
||||
"56": {"name": "吕梁市", "parent": "5"},
|
||||
"57": {"name": "呼和浩特市", "parent": "6"},
|
||||
"58": {"name": "包头市", "parent": "6"},
|
||||
"59": {"name": "乌海市", "parent": "6"},
|
||||
"60": {"name": "赤峰市", "parent": "6"},
|
||||
"61": {"name": "通辽市", "parent": "6"},
|
||||
"62": {"name": "鄂尔多斯市", "parent": "6"},
|
||||
"63": {"name": "呼伦贝尔市", "parent": "6"},
|
||||
"64": {"name": "巴彦淖尔市", "parent": "6"},
|
||||
"65": {"name": "乌兰察布市", "parent": "6"},
|
||||
"66": {"name": "兴安盟", "parent": "6"},
|
||||
"67": {"name": "锡林郭勒盟", "parent": "6"},
|
||||
"68": {"name": "阿拉善盟", "parent": "6"},
|
||||
"69": {"name": "沈阳市", "parent": "7"},
|
||||
"70": {"name": "大连市", "parent": "7"},
|
||||
"71": {"name": "鞍山市", "parent": "7"},
|
||||
"72": {"name": "抚顺市", "parent": "7"},
|
||||
"73": {"name": "本溪市", "parent": "7"},
|
||||
"74": {"name": "丹东市", "parent": "7"},
|
||||
"75": {"name": "锦州市", "parent": "7"},
|
||||
"76": {"name": "营口市", "parent": "7"},
|
||||
"77": {"name": "阜新市", "parent": "7"},
|
||||
"78": {"name": "辽阳市", "parent": "7"},
|
||||
"79": {"name": "盘锦市", "parent": "7"},
|
||||
"80": {"name": "铁岭市", "parent": "7"},
|
||||
"81": {"name": "朝阳市", "parent": "7"},
|
||||
"82": {"name": "葫芦岛市", "parent": "7"},
|
||||
"83": {"name": "长春市", "parent": "8"},
|
||||
"84": {"name": "吉林市", "parent": "8"},
|
||||
"85": {"name": "四平市", "parent": "8"},
|
||||
"86": {"name": "辽源市", "parent": "8"},
|
||||
"87": {"name": "通化市", "parent": "8"},
|
||||
"88": {"name": "白山市", "parent": "8"},
|
||||
"89": {"name": "松原市", "parent": "8"},
|
||||
"90": {"name": "白城市", "parent": "8"},
|
||||
"91": {"name": "延边朝鲜族自治州", "parent": "8"},
|
||||
"92": {"name": "哈尔滨市", "parent": "9"},
|
||||
"93": {"name": "齐齐哈尔市", "parent": "9"},
|
||||
"94": {"name": "鸡西市", "parent": "9"},
|
||||
"95": {"name": "鹤岗市", "parent": "9"},
|
||||
"96": {"name": "双鸭山市", "parent": "9"},
|
||||
"97": {"name": "大庆市", "parent": "9"},
|
||||
"98": {"name": "伊春市", "parent": "9"},
|
||||
"99": {"name": "佳木斯市", "parent": "9"},
|
||||
"100": {"name": "七台河市", "parent": "9"},
|
||||
"101": {"name": "牡丹江市", "parent": "9"},
|
||||
"102": {"name": "黑河市", "parent": "9"},
|
||||
"103": {"name": "绥化市", "parent": "9"},
|
||||
"104": {"name": "大兴安岭地区", "parent": "9"},
|
||||
"105": {"name": "上海市", "parent": "10"},
|
||||
"106": {"name": "南京市", "parent": "11"},
|
||||
"107": {"name": "无锡市", "parent": "11"},
|
||||
"108": {"name": "徐州市", "parent": "11"},
|
||||
"109": {"name": "常州市", "parent": "11"},
|
||||
"110": {"name": "苏州市", "parent": "11"},
|
||||
"111": {"name": "南通市", "parent": "11"},
|
||||
"112": {"name": "连云港市", "parent": "11"},
|
||||
"113": {"name": "淮安市", "parent": "11"},
|
||||
"114": {"name": "盐城市", "parent": "11"},
|
||||
"115": {"name": "扬州市", "parent": "11"},
|
||||
"116": {"name": "镇江市", "parent": "11"},
|
||||
"117": {"name": "泰州市", "parent": "11"},
|
||||
"118": {"name": "宿迁市", "parent": "11"},
|
||||
"119": {"name": "杭州市", "parent": "12"},
|
||||
"120": {"name": "宁波市", "parent": "12"},
|
||||
"121": {"name": "温州市", "parent": "12"},
|
||||
"122": {"name": "嘉兴市", "parent": "12"},
|
||||
"123": {"name": "湖州市", "parent": "12"},
|
||||
"124": {"name": "绍兴市", "parent": "12"},
|
||||
"125": {"name": "金华市", "parent": "12"},
|
||||
"126": {"name": "衢州市", "parent": "12"},
|
||||
"127": {"name": "舟山市", "parent": "12"},
|
||||
"128": {"name": "台州市", "parent": "12"},
|
||||
"129": {"name": "丽水市", "parent": "12"},
|
||||
"130": {"name": "合肥市", "parent": "13"},
|
||||
"131": {"name": "芜湖市", "parent": "13"},
|
||||
"132": {"name": "蚌埠市", "parent": "13"},
|
||||
"133": {"name": "淮南市", "parent": "13"},
|
||||
"134": {"name": "马鞍山市", "parent": "13"},
|
||||
"135": {"name": "淮北市", "parent": "13"},
|
||||
"136": {"name": "铜陵市", "parent": "13"},
|
||||
"137": {"name": "安庆市", "parent": "13"},
|
||||
"138": {"name": "黄山市", "parent": "13"},
|
||||
"139": {"name": "滁州市", "parent": "13"},
|
||||
"140": {"name": "阜阳市", "parent": "13"},
|
||||
"141": {"name": "宿州市", "parent": "13"},
|
||||
"143": {"name": "六安市", "parent": "13"},
|
||||
"144": {"name": "亳州市", "parent": "13"},
|
||||
"145": {"name": "池州市", "parent": "13"},
|
||||
"146": {"name": "宣城市", "parent": "13"},
|
||||
"147": {"name": "福州市", "parent": "14"},
|
||||
"148": {"name": "厦门市", "parent": "14"},
|
||||
"149": {"name": "莆田市", "parent": "14"},
|
||||
"150": {"name": "三明市", "parent": "14"},
|
||||
"151": {"name": "泉州市", "parent": "14"},
|
||||
"152": {"name": "漳州市", "parent": "14"},
|
||||
"153": {"name": "南平市", "parent": "14"},
|
||||
"154": {"name": "龙岩市", "parent": "14"},
|
||||
"155": {"name": "宁德市", "parent": "14"},
|
||||
"156": {"name": "南昌市", "parent": "15"},
|
||||
"157": {"name": "景德镇市", "parent": "15"},
|
||||
"158": {"name": "萍乡市", "parent": "15"},
|
||||
"159": {"name": "九江市", "parent": "15"},
|
||||
"160": {"name": "新余市", "parent": "15"},
|
||||
"161": {"name": "鹰潭市", "parent": "15"},
|
||||
"162": {"name": "赣州市", "parent": "15"},
|
||||
"163": {"name": "吉安市", "parent": "15"},
|
||||
"164": {"name": "宜春市", "parent": "15"},
|
||||
"165": {"name": "抚州市", "parent": "15"},
|
||||
"166": {"name": "上饶市", "parent": "15"},
|
||||
"167": {"name": "济南市", "parent": "16"},
|
||||
"168": {"name": "青岛市", "parent": "16"},
|
||||
"169": {"name": "淄博市", "parent": "16"},
|
||||
"170": {"name": "枣庄市", "parent": "16"},
|
||||
"171": {"name": "东营市", "parent": "16"},
|
||||
"172": {"name": "烟台市", "parent": "16"},
|
||||
"173": {"name": "潍坊市", "parent": "16"},
|
||||
"174": {"name": "济宁市", "parent": "16"},
|
||||
"175": {"name": "泰安市", "parent": "16"},
|
||||
"176": {"name": "威海市", "parent": "16"},
|
||||
"177": {"name": "日照市", "parent": "16"},
|
||||
"179": {"name": "临沂市", "parent": "16"},
|
||||
"180": {"name": "德州市", "parent": "16"},
|
||||
"181": {"name": "聊城市", "parent": "16"},
|
||||
"182": {"name": "滨州市", "parent": "16"},
|
||||
"183": {"name": "菏泽市", "parent": "16"},
|
||||
"184": {"name": "郑州市", "parent": "17"},
|
||||
"185": {"name": "开封市", "parent": "17"},
|
||||
"186": {"name": "洛阳市", "parent": "17"},
|
||||
"187": {"name": "平顶山市", "parent": "17"},
|
||||
"188": {"name": "安阳市", "parent": "17"},
|
||||
"189": {"name": "鹤壁市", "parent": "17"},
|
||||
"190": {"name": "新乡市", "parent": "17"},
|
||||
"191": {"name": "焦作市", "parent": "17"},
|
||||
"192": {"name": "濮阳市", "parent": "17"},
|
||||
"193": {"name": "许昌市", "parent": "17"},
|
||||
"194": {"name": "漯河市", "parent": "17"},
|
||||
"195": {"name": "三门峡市", "parent": "17"},
|
||||
"196": {"name": "南阳市", "parent": "17"},
|
||||
"197": {"name": "商丘市", "parent": "17"},
|
||||
"198": {"name": "信阳市", "parent": "17"},
|
||||
"199": {"name": "周口市", "parent": "17"},
|
||||
"200": {"name": "驻马店市", "parent": "17"},
|
||||
"201": {"name": "武汉市", "parent": "18"},
|
||||
"202": {"name": "黄石市", "parent": "18"},
|
||||
"203": {"name": "十堰市", "parent": "18"},
|
||||
"204": {"name": "宜昌市", "parent": "18"},
|
||||
"205": {"name": "襄阳市", "parent": "18"},
|
||||
"206": {"name": "鄂州市", "parent": "18"},
|
||||
"207": {"name": "荆门市", "parent": "18"},
|
||||
"208": {"name": "孝感市", "parent": "18"},
|
||||
"209": {"name": "荆州市", "parent": "18"},
|
||||
"210": {"name": "黄冈市", "parent": "18"},
|
||||
"211": {"name": "咸宁市", "parent": "18"},
|
||||
"212": {"name": "随州市", "parent": "18"},
|
||||
"213": {"name": "恩施土家族苗族自治州", "parent": "18"},
|
||||
"215": {"name": "长沙市", "parent": "19"},
|
||||
"216": {"name": "株洲市", "parent": "19"},
|
||||
"217": {"name": "湘潭市", "parent": "19"},
|
||||
"218": {"name": "衡阳市", "parent": "19"},
|
||||
"219": {"name": "邵阳市", "parent": "19"},
|
||||
"220": {"name": "岳阳市", "parent": "19"},
|
||||
"221": {"name": "常德市", "parent": "19"},
|
||||
"222": {"name": "张家界市", "parent": "19"},
|
||||
"223": {"name": "益阳市", "parent": "19"},
|
||||
"224": {"name": "郴州市", "parent": "19"},
|
||||
"225": {"name": "永州市", "parent": "19"},
|
||||
"226": {"name": "怀化市", "parent": "19"},
|
||||
"227": {"name": "娄底市", "parent": "19"},
|
||||
"228": {"name": "湘西土家族苗族自治州", "parent": "19"},
|
||||
"229": {"name": "广州市", "parent": "20"},
|
||||
"230": {"name": "韶关市", "parent": "20"},
|
||||
"231": {"name": "深圳市", "parent": "20"},
|
||||
"232": {"name": "珠海市", "parent": "20"},
|
||||
"233": {"name": "汕头市", "parent": "20"},
|
||||
"234": {"name": "佛山市", "parent": "20"},
|
||||
"235": {"name": "江门市", "parent": "20"},
|
||||
"236": {"name": "湛江市", "parent": "20"},
|
||||
"237": {"name": "茂名市", "parent": "20"},
|
||||
"238": {"name": "肇庆市", "parent": "20"},
|
||||
"239": {"name": "惠州市", "parent": "20"},
|
||||
"240": {"name": "梅州市", "parent": "20"},
|
||||
"241": {"name": "汕尾市", "parent": "20"},
|
||||
"242": {"name": "河源市", "parent": "20"},
|
||||
"243": {"name": "阳江市", "parent": "20"},
|
||||
"244": {"name": "清远市", "parent": "20"},
|
||||
"245": {"name": "东莞市", "parent": "20"},
|
||||
"246": {"name": "中山市", "parent": "20"},
|
||||
"247": {"name": "潮州市", "parent": "20"},
|
||||
"248": {"name": "揭阳市", "parent": "20"},
|
||||
"249": {"name": "云浮市", "parent": "20"},
|
||||
"250": {"name": "南宁市", "parent": "21"},
|
||||
"251": {"name": "柳州市", "parent": "21"},
|
||||
"252": {"name": "桂林市", "parent": "21"},
|
||||
"253": {"name": "梧州市", "parent": "21"},
|
||||
"254": {"name": "北海市", "parent": "21"},
|
||||
"255": {"name": "防城港市", "parent": "21"},
|
||||
"256": {"name": "钦州市", "parent": "21"},
|
||||
"257": {"name": "贵港市", "parent": "21"},
|
||||
"258": {"name": "玉林市", "parent": "21"},
|
||||
"259": {"name": "百色市", "parent": "21"},
|
||||
"260": {"name": "贺州市", "parent": "21"},
|
||||
"261": {"name": "河池市", "parent": "21"},
|
||||
"262": {"name": "来宾市", "parent": "21"},
|
||||
"263": {"name": "崇左市", "parent": "21"},
|
||||
"264": {"name": "海口市", "parent": "22"},
|
||||
"265": {"name": "三亚市", "parent": "22"},
|
||||
"267": {"name": "重庆市", "parent": "23"},
|
||||
"268": {"name": "成都市", "parent": "24"},
|
||||
"269": {"name": "自贡市", "parent": "24"},
|
||||
"270": {"name": "攀枝花市", "parent": "24"},
|
||||
"271": {"name": "泸州市", "parent": "24"},
|
||||
"272": {"name": "德阳市", "parent": "24"},
|
||||
"273": {"name": "绵阳市", "parent": "24"},
|
||||
"274": {"name": "广元市", "parent": "24"},
|
||||
"275": {"name": "遂宁市", "parent": "24"},
|
||||
"276": {"name": "内江市", "parent": "24"},
|
||||
"277": {"name": "乐山市", "parent": "24"},
|
||||
"278": {"name": "南充市", "parent": "24"},
|
||||
"279": {"name": "眉山市", "parent": "24"},
|
||||
"280": {"name": "宜宾市", "parent": "24"},
|
||||
"281": {"name": "广安市", "parent": "24"},
|
||||
"282": {"name": "达州市", "parent": "24"},
|
||||
"283": {"name": "雅安市", "parent": "24"},
|
||||
"284": {"name": "巴中市", "parent": "24"},
|
||||
"285": {"name": "资阳市", "parent": "24"},
|
||||
"286": {"name": "阿坝藏族羌族自治州", "parent": "24"},
|
||||
"287": {"name": "甘孜藏族自治州", "parent": "24"},
|
||||
"288": {"name": "凉山彝族自治州", "parent": "24"},
|
||||
"289": {"name": "贵阳市", "parent": "25"},
|
||||
"290": {"name": "六盘水市", "parent": "25"},
|
||||
"291": {"name": "遵义市", "parent": "25"},
|
||||
"292": {"name": "安顺市", "parent": "25"},
|
||||
"293": {"name": "铜仁市", "parent": "25"},
|
||||
"294": {"name": "黔西南布依族苗族自治州", "parent": "25"},
|
||||
"295": {"name": "毕节市", "parent": "25"},
|
||||
"296": {"name": "黔东南苗族侗族自治州", "parent": "25"},
|
||||
"297": {"name": "黔南布依族苗族自治州", "parent": "25"},
|
||||
"298": {"name": "昆明市", "parent": "26"},
|
||||
"299": {"name": "曲靖市", "parent": "26"},
|
||||
"300": {"name": "玉溪市", "parent": "26"},
|
||||
"301": {"name": "保山市", "parent": "26"},
|
||||
"302": {"name": "昭通市", "parent": "26"},
|
||||
"303": {"name": "丽江市", "parent": "26"},
|
||||
"304": {"name": "普洱市", "parent": "26"},
|
||||
"305": {"name": "临沧市", "parent": "26"},
|
||||
"306": {"name": "楚雄彝族自治州", "parent": "26"},
|
||||
"307": {"name": "红河哈尼族彝族自治州", "parent": "26"},
|
||||
"308": {"name": "文山壮族苗族自治州", "parent": "26"},
|
||||
"309": {"name": "西双版纳傣族自治州", "parent": "26"},
|
||||
"310": {"name": "大理白族自治州", "parent": "26"},
|
||||
"311": {"name": "德宏傣族景颇族自治州", "parent": "26"},
|
||||
"312": {"name": "怒江傈僳族自治州", "parent": "26"},
|
||||
"313": {"name": "迪庆藏族自治州", "parent": "26"},
|
||||
"314": {"name": "拉萨市", "parent": "27"},
|
||||
"315": {"name": "昌都市", "parent": "27"},
|
||||
"316": {"name": "山南市", "parent": "27"},
|
||||
"317": {"name": "日喀则市", "parent": "27"},
|
||||
"318": {"name": "那曲市", "parent": "27"},
|
||||
"319": {"name": "阿里地区", "parent": "27"},
|
||||
"320": {"name": "林芝市", "parent": "27"},
|
||||
"321": {"name": "西安市", "parent": "28"},
|
||||
"322": {"name": "铜川市", "parent": "28"},
|
||||
"323": {"name": "宝鸡市", "parent": "28"},
|
||||
"324": {"name": "咸阳市", "parent": "28"},
|
||||
"325": {"name": "渭南市", "parent": "28"},
|
||||
"326": {"name": "延安市", "parent": "28"},
|
||||
"327": {"name": "汉中市", "parent": "28"},
|
||||
"328": {"name": "榆林市", "parent": "28"},
|
||||
"329": {"name": "安康市", "parent": "28"},
|
||||
"330": {"name": "商洛市", "parent": "28"},
|
||||
"331": {"name": "兰州市", "parent": "29"},
|
||||
"332": {"name": "嘉峪关市", "parent": "29"},
|
||||
"333": {"name": "金昌市", "parent": "29"},
|
||||
"334": {"name": "白银市", "parent": "29"},
|
||||
"335": {"name": "天水市", "parent": "29"},
|
||||
"336": {"name": "武威市", "parent": "29"},
|
||||
"337": {"name": "张掖市", "parent": "29"},
|
||||
"338": {"name": "平凉市", "parent": "29"},
|
||||
"339": {"name": "酒泉市", "parent": "29"},
|
||||
"340": {"name": "庆阳市", "parent": "29"},
|
||||
"341": {"name": "定西市", "parent": "29"},
|
||||
"342": {"name": "陇南市", "parent": "29"},
|
||||
"343": {"name": "临夏回族自治州", "parent": "29"},
|
||||
"344": {"name": "甘南藏族自治州", "parent": "29"},
|
||||
"345": {"name": "西宁市", "parent": "30"},
|
||||
"346": {"name": "海东市", "parent": "30"},
|
||||
"347": {"name": "海北藏族自治州", "parent": "30"},
|
||||
"348": {"name": "黄南藏族自治州", "parent": "30"},
|
||||
"349": {"name": "海南藏族自治州", "parent": "30"},
|
||||
"350": {"name": "果洛藏族自治州", "parent": "30"},
|
||||
"351": {"name": "玉树藏族自治州", "parent": "30"},
|
||||
"352": {"name": "海西蒙古族藏族自治州", "parent": "30"},
|
||||
"353": {"name": "银川市", "parent": "31"},
|
||||
"354": {"name": "石嘴山市", "parent": "31"},
|
||||
"355": {"name": "吴忠市", "parent": "31"},
|
||||
"356": {"name": "固原市", "parent": "31"},
|
||||
"357": {"name": "中卫市", "parent": "31"},
|
||||
"358": {"name": "乌鲁木齐市", "parent": "32"},
|
||||
"359": {"name": "克拉玛依市", "parent": "32"},
|
||||
"360": {"name": "吐鲁番市", "parent": "32"},
|
||||
"361": {"name": "哈密市", "parent": "32"},
|
||||
"362": {"name": "昌吉回族自治州", "parent": "32"},
|
||||
"363": {"name": "博尔塔拉蒙古自治州", "parent": "32"},
|
||||
"364": {"name": "巴音郭楞蒙古自治州", "parent": "32"},
|
||||
"365": {"name": "阿克苏地区", "parent": "32"},
|
||||
"366": {"name": "克孜勒苏柯尔克孜自治州", "parent": "32"},
|
||||
"367": {"name": "喀什地区", "parent": "32"},
|
||||
"368": {"name": "和田地区", "parent": "32"},
|
||||
"369": {"name": "伊犁哈萨克自治州", "parent": "32"},
|
||||
"370": {"name": "塔城地区", "parent": "32"},
|
||||
"371": {"name": "阿勒泰地区", "parent": "32"},
|
||||
"372": {"name": "新疆省直辖行政单位", "parent": "32"},
|
||||
"373": {"name": "可克达拉市", "parent": "32"},
|
||||
"374": {"name": "昆玉市", "parent": "32"},
|
||||
"375": {"name": "胡杨河市", "parent": "32"},
|
||||
"376": {"name": "双河市", "parent": "32"},
|
||||
"3560": {"name": "北票市", "parent": "7"},
|
||||
"3615": {"name": "高州市", "parent": "20"},
|
||||
"3651": {"name": "济源市", "parent": "17"},
|
||||
"3662": {"name": "胶南市", "parent": "16"},
|
||||
"3683": {"name": "老河口市", "parent": "18"},
|
||||
"3758": {"name": "沙河市", "parent": "4"},
|
||||
"3822": {"name": "宜城市", "parent": "18"},
|
||||
"3842": {"name": "枣阳市", "parent": "18"},
|
||||
"3850": {"name": "肇东市", "parent": "9"},
|
||||
"3905": {"name": "澳门", "parent": "1"},
|
||||
"3906": {"name": "澳门", "parent": "3905"},
|
||||
"3907": {"name": "香港", "parent": "1"},
|
||||
"3908": {"name": "香港", "parent": "3907"},
|
||||
"3947": {"name": "仙桃市", "parent": "18"},
|
||||
"3954": {"name": "台湾", "parent": "1"},
|
||||
"3955": {"name": "台湾", "parent": "3954"},
|
||||
"3956": {"name": "海外", "parent": "1"},
|
||||
"3957": {"name": "海外", "parent": "3956"},
|
||||
"3958": {"name": "美国", "parent": "3956"},
|
||||
"3959": {"name": "加拿大", "parent": "3956"},
|
||||
"3961": {"name": "日本", "parent": "3956"},
|
||||
"3962": {"name": "韩国", "parent": "3956"},
|
||||
"3963": {"name": "德国", "parent": "3956"},
|
||||
"3964": {"name": "英国", "parent": "3956"},
|
||||
"3965": {"name": "意大利", "parent": "3956"},
|
||||
"3966": {"name": "西班牙", "parent": "3956"},
|
||||
"3967": {"name": "法国", "parent": "3956"},
|
||||
"3968": {"name": "澳大利亚", "parent": "3956"},
|
||||
"3969": {"name": "东城区", "parent": "2"},
|
||||
"3970": {"name": "西城区", "parent": "2"},
|
||||
"3971": {"name": "崇文区", "parent": "2"},
|
||||
"3972": {"name": "宣武区", "parent": "2"},
|
||||
"3973": {"name": "朝阳区", "parent": "2"},
|
||||
"3974": {"name": "海淀区", "parent": "2"},
|
||||
"3975": {"name": "丰台区", "parent": "2"},
|
||||
"3976": {"name": "石景山区", "parent": "2"},
|
||||
"3977": {"name": "门头沟区", "parent": "2"},
|
||||
"3978": {"name": "房山区", "parent": "2"},
|
||||
"3979": {"name": "通州区", "parent": "2"},
|
||||
"3980": {"name": "顺义区", "parent": "2"},
|
||||
"3981": {"name": "昌平区", "parent": "2"},
|
||||
"3982": {"name": "大兴区", "parent": "2"},
|
||||
"3983": {"name": "平谷区", "parent": "2"},
|
||||
"3984": {"name": "怀柔区", "parent": "2"},
|
||||
"3985": {"name": "密云区", "parent": "2"},
|
||||
"3986": {"name": "延庆区", "parent": "2"},
|
||||
"3987": {"name": "黄浦区", "parent": "10"},
|
||||
"3988": {"name": "徐汇区", "parent": "10"},
|
||||
"3989": {"name": "长宁区", "parent": "10"},
|
||||
"3990": {"name": "静安区", "parent": "10"},
|
||||
"3991": {"name": "普陀区", "parent": "10"},
|
||||
"3992": {"name": "闸北区", "parent": "10"},
|
||||
"3993": {"name": "虹口区", "parent": "10"},
|
||||
"3994": {"name": "杨浦区", "parent": "10"},
|
||||
"3995": {"name": "宝山区", "parent": "10"},
|
||||
"3996": {"name": "闵行区", "parent": "10"},
|
||||
"3997": {"name": "嘉定区", "parent": "10"},
|
||||
"3998": {"name": "浦东新区", "parent": "10"},
|
||||
"3999": {"name": "松江区", "parent": "10"},
|
||||
"4000": {"name": "金山区", "parent": "10"},
|
||||
"4001": {"name": "青浦区", "parent": "10"},
|
||||
"4002": {"name": "奉贤区", "parent": "10"},
|
||||
"4003": {"name": "崇明区", "parent": "10"},
|
||||
"4004": {"name": "和平区", "parent": "3"},
|
||||
"4005": {"name": "河东区", "parent": "3"},
|
||||
"4006": {"name": "河西区", "parent": "3"},
|
||||
"4007": {"name": "南开区", "parent": "3"},
|
||||
"4008": {"name": "红桥区", "parent": "3"},
|
||||
"4009": {"name": "河北区", "parent": "3"},
|
||||
"4010": {"name": "滨海新区", "parent": "3"},
|
||||
"4011": {"name": "东丽区", "parent": "3"},
|
||||
"4012": {"name": "西青区", "parent": "3"},
|
||||
"4013": {"name": "北辰区", "parent": "3"},
|
||||
"4014": {"name": "津南区", "parent": "3"},
|
||||
"4015": {"name": "武清区", "parent": "3"},
|
||||
"4016": {"name": "宝坻区", "parent": "3"},
|
||||
"4017": {"name": "静海区", "parent": "3"},
|
||||
"4018": {"name": "宁河区", "parent": "3"},
|
||||
"4019": {"name": "蓟州区", "parent": "3"},
|
||||
"4020": {"name": "渝中区", "parent": "23"},
|
||||
"4021": {"name": "江北区", "parent": "23"},
|
||||
"4022": {"name": "南岸区", "parent": "23"},
|
||||
"4023": {"name": "沙坪坝区", "parent": "23"},
|
||||
"4024": {"name": "九龙坡区", "parent": "23"},
|
||||
"4025": {"name": "大渡口区", "parent": "23"},
|
||||
"4026": {"name": "渝北区", "parent": "23"},
|
||||
"4027": {"name": "巴南区", "parent": "23"},
|
||||
"4028": {"name": "北碚区", "parent": "23"},
|
||||
"4029": {"name": "万州区", "parent": "23"},
|
||||
"4030": {"name": "黔江区", "parent": "23"},
|
||||
"4031": {"name": "永川区", "parent": "23"},
|
||||
"4032": {"name": "涪陵区", "parent": "23"},
|
||||
"4033": {"name": "江津区", "parent": "23"},
|
||||
"4034": {"name": "合川区", "parent": "23"},
|
||||
"4035": {"name": "双桥区", "parent": "23"},
|
||||
"4036": {"name": "万盛区", "parent": "23"},
|
||||
"4037": {"name": "荣昌区", "parent": "23"},
|
||||
"4038": {"name": "大足区", "parent": "23"},
|
||||
"4039": {"name": "璧山区", "parent": "23"},
|
||||
"4040": {"name": "铜梁区", "parent": "23"},
|
||||
"4041": {"name": "潼南区", "parent": "23"},
|
||||
"4042": {"name": "綦江区", "parent": "23"},
|
||||
"4043": {"name": "忠县", "parent": "23"},
|
||||
"4044": {"name": "开州区", "parent": "23"},
|
||||
"4045": {"name": "云阳县", "parent": "23"},
|
||||
"4046": {"name": "梁平区", "parent": "23"},
|
||||
"4047": {"name": "垫江县", "parent": "23"},
|
||||
"4048": {"name": "丰都县", "parent": "23"},
|
||||
"4049": {"name": "奉节县", "parent": "23"},
|
||||
"4050": {"name": "巫山县", "parent": "23"},
|
||||
"4051": {"name": "巫溪县", "parent": "23"},
|
||||
"4052": {"name": "城口县", "parent": "23"},
|
||||
"4053": {"name": "武隆区", "parent": "23"},
|
||||
"4054": {"name": "石柱土家族自治县", "parent": "23"},
|
||||
"4055": {"name": "秀山土家族苗族自治县", "parent": "23"},
|
||||
"4056": {"name": "酉阳土家族苗族自治县", "parent": "23"},
|
||||
"4057": {"name": "彭水苗族土家族自治县", "parent": "23"},
|
||||
"4058": {"name": "潜江市", "parent": "18"},
|
||||
"4059": {"name": "三沙市", "parent": "22"},
|
||||
"4060": {"name": "石河子市", "parent": "32"},
|
||||
"4061": {"name": "阿拉尔市", "parent": "32"},
|
||||
"4062": {"name": "图木舒克市", "parent": "32"},
|
||||
"4063": {"name": "五家渠市", "parent": "32"},
|
||||
"4064": {"name": "北屯市", "parent": "32"},
|
||||
"4065": {"name": "铁门关市", "parent": "32"},
|
||||
"4066": {"name": "儋州市", "parent": "22"},
|
||||
"4067": {"name": "五指山市", "parent": "22"},
|
||||
"4068": {"name": "文昌市", "parent": "22"},
|
||||
"4069": {"name": "琼海市", "parent": "22"},
|
||||
"4070": {"name": "万宁市", "parent": "22"},
|
||||
"4072": {"name": "定安县", "parent": "22"},
|
||||
"4073": {"name": "屯昌县", "parent": "22"},
|
||||
"4074": {"name": "澄迈县", "parent": "22"},
|
||||
"4075": {"name": "临高县", "parent": "22"},
|
||||
"4076": {"name": "琼中黎族苗族自治县", "parent": "22"},
|
||||
"4077": {"name": "保亭黎族苗族自治县", "parent": "22"},
|
||||
"4078": {"name": "白沙黎族自治县", "parent": "22"},
|
||||
"4079": {"name": "昌江黎族自治县", "parent": "22"},
|
||||
"4080": {"name": "乐东黎族自治县", "parent": "22"},
|
||||
"4081": {"name": "陵水黎族自治县", "parent": "22"},
|
||||
"4082": {"name": "马来西亚", "parent": "3956"},
|
||||
"6047": {"name": "长寿区", "parent": "23"},
|
||||
"6857": {"name": "阿富汗", "parent": "3956"},
|
||||
"6858": {"name": "阿尔巴尼亚", "parent": "3956"},
|
||||
"6859": {"name": "阿尔及利亚", "parent": "3956"},
|
||||
"6860": {"name": "美属萨摩亚", "parent": "3956"},
|
||||
"6861": {"name": "安道尔", "parent": "3956"},
|
||||
"6862": {"name": "安哥拉", "parent": "3956"},
|
||||
"6863": {"name": "安圭拉", "parent": "3956"},
|
||||
"6864": {"name": "南极洲", "parent": "3956"},
|
||||
"6865": {"name": "安提瓜和巴布达", "parent": "3956"},
|
||||
"6866": {"name": "阿根廷", "parent": "3956"},
|
||||
"6867": {"name": "亚美尼亚", "parent": "3956"},
|
||||
"6869": {"name": "奥地利", "parent": "3956"},
|
||||
"6870": {"name": "阿塞拜疆", "parent": "3956"},
|
||||
"6871": {"name": "巴哈马", "parent": "3956"},
|
||||
"6872": {"name": "巴林", "parent": "3956"},
|
||||
"6873": {"name": "孟加拉国", "parent": "3956"},
|
||||
"6874": {"name": "巴巴多斯", "parent": "3956"},
|
||||
"6875": {"name": "白俄罗斯", "parent": "3956"},
|
||||
"6876": {"name": "比利时", "parent": "3956"},
|
||||
"6877": {"name": "伯利兹", "parent": "3956"},
|
||||
"6878": {"name": "贝宁", "parent": "3956"},
|
||||
"6879": {"name": "百慕大", "parent": "3956"},
|
||||
"6880": {"name": "不丹", "parent": "3956"},
|
||||
"6881": {"name": "玻利维亚", "parent": "3956"},
|
||||
"6882": {"name": "波黑", "parent": "3956"},
|
||||
"6883": {"name": "博茨瓦纳", "parent": "3956"},
|
||||
"6884": {"name": "布维岛", "parent": "3956"},
|
||||
"6885": {"name": "巴西", "parent": "3956"},
|
||||
"6886": {"name": "英属印度洋领土", "parent": "3956"},
|
||||
"6887": {"name": "文莱", "parent": "3956"},
|
||||
"6888": {"name": "保加利亚", "parent": "3956"},
|
||||
"6889": {"name": "布基纳法索", "parent": "3956"},
|
||||
"6890": {"name": "布隆迪", "parent": "3956"},
|
||||
"6891": {"name": "柬埔寨", "parent": "3956"},
|
||||
"6892": {"name": "喀麦隆", "parent": "3956"},
|
||||
"6893": {"name": "佛得角", "parent": "3956"},
|
||||
"6894": {"name": "开曼群岛", "parent": "3956"},
|
||||
"6895": {"name": "中非", "parent": "3956"},
|
||||
"6896": {"name": "乍得", "parent": "3956"},
|
||||
"6897": {"name": "智利", "parent": "3956"},
|
||||
"6898": {"name": "圣诞岛", "parent": "3956"},
|
||||
"6899": {"name": "科科斯(基林)群岛", "parent": "3956"},
|
||||
"6900": {"name": "哥伦比亚", "parent": "3956"},
|
||||
"6901": {"name": "科摩罗", "parent": "3956"},
|
||||
"6902": {"name": "刚果(布)", "parent": "3956"},
|
||||
"6903": {"name": "刚果(金)", "parent": "3956"},
|
||||
"6904": {"name": "库克群岛", "parent": "3956"},
|
||||
"6905": {"name": "哥斯达黎加", "parent": "3956"},
|
||||
"6906": {"name": "科特迪瓦", "parent": "3956"},
|
||||
"6907": {"name": "克罗地亚", "parent": "3956"},
|
||||
"6908": {"name": "古巴", "parent": "3956"},
|
||||
"6909": {"name": "塞浦路斯", "parent": "3956"},
|
||||
"6910": {"name": "捷克", "parent": "3956"},
|
||||
"6911": {"name": "丹麦", "parent": "3956"},
|
||||
"6912": {"name": "吉布提", "parent": "3956"},
|
||||
"6913": {"name": "多米尼克", "parent": "3956"},
|
||||
"6914": {"name": "多米尼加共和国", "parent": "3956"},
|
||||
"6915": {"name": "东帝汶", "parent": "3956"},
|
||||
"6916": {"name": "厄瓜多尔", "parent": "3956"},
|
||||
"6917": {"name": "埃及", "parent": "3956"},
|
||||
"6918": {"name": "萨尔瓦多", "parent": "3956"},
|
||||
"6919": {"name": "赤道几内亚", "parent": "3956"},
|
||||
"6920": {"name": "厄立特里亚", "parent": "3956"},
|
||||
"6921": {"name": "爱沙尼亚", "parent": "3956"},
|
||||
"6922": {"name": "埃塞俄比亚", "parent": "3956"},
|
||||
"6923": {"name": "福克兰群岛(马尔维纳斯)", "parent": "3956"},
|
||||
"6924": {"name": "法罗群岛", "parent": "3956"},
|
||||
"6925": {"name": "斐济", "parent": "3956"},
|
||||
"6926": {"name": "芬兰", "parent": "3956"},
|
||||
"6927": {"name": "法属圭亚那", "parent": "3956"},
|
||||
"6928": {"name": "法属波利尼西亚", "parent": "3956"},
|
||||
"6929": {"name": "法属南部领土", "parent": "3956"},
|
||||
"6930": {"name": "加蓬", "parent": "3956"},
|
||||
"6931": {"name": "冈比亚", "parent": "3956"},
|
||||
"6932": {"name": "格鲁吉亚", "parent": "3956"},
|
||||
"6933": {"name": "加纳", "parent": "3956"},
|
||||
"6934": {"name": "直布罗陀", "parent": "3956"},
|
||||
"6935": {"name": "希腊", "parent": "3956"},
|
||||
"6936": {"name": "格陵兰", "parent": "3956"},
|
||||
"6937": {"name": "格林纳达", "parent": "3956"},
|
||||
"6938": {"name": "瓜德罗普", "parent": "3956"},
|
||||
"6939": {"name": "关岛", "parent": "3956"},
|
||||
"6940": {"name": "危地马拉", "parent": "3956"},
|
||||
"6941": {"name": "几内亚", "parent": "3956"},
|
||||
"6942": {"name": "几内亚比绍", "parent": "3956"},
|
||||
"6943": {"name": "圭亚那", "parent": "3956"},
|
||||
"6944": {"name": "海地", "parent": "3956"},
|
||||
"6945": {"name": "赫德岛和麦克唐纳岛", "parent": "3956"},
|
||||
"6946": {"name": "洪都拉斯", "parent": "3956"},
|
||||
"6947": {"name": "匈牙利", "parent": "3956"},
|
||||
"6948": {"name": "冰岛", "parent": "3956"},
|
||||
"6949": {"name": "印度", "parent": "3956"},
|
||||
"6950": {"name": "印度尼西亚", "parent": "3956"},
|
||||
"6951": {"name": "伊朗", "parent": "3956"},
|
||||
"6952": {"name": "伊拉克", "parent": "3956"},
|
||||
"6953": {"name": "爱尔兰", "parent": "3956"},
|
||||
"6954": {"name": "以色列", "parent": "3956"},
|
||||
"6955": {"name": "牙买加", "parent": "3956"},
|
||||
"6956": {"name": "约旦", "parent": "3956"},
|
||||
"6957": {"name": "哈萨克斯坦", "parent": "3956"},
|
||||
"6958": {"name": "肯尼亚", "parent": "3956"},
|
||||
"6959": {"name": "基里巴斯", "parent": "3956"},
|
||||
"6960": {"name": "朝鲜", "parent": "3956"},
|
||||
"6961": {"name": "科威特", "parent": "3956"},
|
||||
"6962": {"name": "吉尔吉斯斯坦", "parent": "3956"},
|
||||
"6963": {"name": "老挝", "parent": "3956"},
|
||||
"6964": {"name": "拉脱维亚", "parent": "3956"},
|
||||
"6965": {"name": "黎巴嫩", "parent": "3956"},
|
||||
"6966": {"name": "莱索托", "parent": "3956"},
|
||||
"6967": {"name": "利比里亚", "parent": "3956"},
|
||||
"6968": {"name": "利比亚", "parent": "3956"},
|
||||
"6969": {"name": "列支敦士登", "parent": "3956"},
|
||||
"6970": {"name": "立陶宛", "parent": "3956"},
|
||||
"6971": {"name": "卢森堡", "parent": "3956"},
|
||||
"6972": {"name": "前南马其顿", "parent": "3956"},
|
||||
"6973": {"name": "马达加斯加", "parent": "3956"},
|
||||
"6974": {"name": "马拉维", "parent": "3956"},
|
||||
"6975": {"name": "马尔代夫", "parent": "3956"},
|
||||
"6976": {"name": "马里", "parent": "3956"},
|
||||
"6977": {"name": "马耳他", "parent": "3956"},
|
||||
"6978": {"name": "马绍尔群岛", "parent": "3956"},
|
||||
"6979": {"name": "马提尼克", "parent": "3956"},
|
||||
"6980": {"name": "毛里塔尼亚", "parent": "3956"},
|
||||
"6981": {"name": "毛里求斯", "parent": "3956"},
|
||||
"6982": {"name": "马约特", "parent": "3956"},
|
||||
"6983": {"name": "墨西哥", "parent": "3956"},
|
||||
"6984": {"name": "密克罗尼西亚联邦", "parent": "3956"},
|
||||
"6985": {"name": "摩尔多瓦", "parent": "3956"},
|
||||
"6986": {"name": "摩纳哥", "parent": "3956"},
|
||||
"6987": {"name": "蒙古", "parent": "3956"},
|
||||
"6988": {"name": "蒙特塞拉特", "parent": "3956"},
|
||||
"6989": {"name": "摩洛哥", "parent": "3956"},
|
||||
"6990": {"name": "莫桑比克", "parent": "3956"},
|
||||
"6991": {"name": "缅甸", "parent": "3956"},
|
||||
"6992": {"name": "纳米比亚", "parent": "3956"},
|
||||
"6993": {"name": "瑙鲁", "parent": "3956"},
|
||||
"6994": {"name": "尼泊尔", "parent": "3956"},
|
||||
"6995": {"name": "荷兰", "parent": "3956"},
|
||||
"6996": {"name": "荷属安的列斯", "parent": "3956"},
|
||||
"6997": {"name": "新喀里多尼亚", "parent": "3956"},
|
||||
"6998": {"name": "新西兰", "parent": "3956"},
|
||||
"6999": {"name": "尼加拉瓜", "parent": "3956"},
|
||||
"7000": {"name": "尼日尔", "parent": "3956"},
|
||||
"7001": {"name": "尼日利亚", "parent": "3956"},
|
||||
"7002": {"name": "纽埃", "parent": "3956"},
|
||||
"7003": {"name": "诺福克岛", "parent": "3956"},
|
||||
"7004": {"name": "北马里亚纳", "parent": "3956"},
|
||||
"7005": {"name": "挪威", "parent": "3956"},
|
||||
"7006": {"name": "阿曼", "parent": "3956"},
|
||||
"7007": {"name": "巴基斯坦", "parent": "3956"},
|
||||
"7008": {"name": "帕劳", "parent": "3956"},
|
||||
"7009": {"name": "巴勒斯坦", "parent": "3956"},
|
||||
"7010": {"name": "巴拿马", "parent": "3956"},
|
||||
"7011": {"name": "巴布亚新几内亚", "parent": "3956"},
|
||||
"7012": {"name": "巴拉圭", "parent": "3956"},
|
||||
"7013": {"name": "秘鲁", "parent": "3956"},
|
||||
"7014": {"name": "菲律宾", "parent": "3956"},
|
||||
"7015": {"name": "皮特凯恩群岛", "parent": "3956"},
|
||||
"7016": {"name": "波兰", "parent": "3956"},
|
||||
"7017": {"name": "葡萄牙", "parent": "3956"},
|
||||
"7018": {"name": "波多黎各", "parent": "3956"},
|
||||
"7019": {"name": "卡塔尔", "parent": "3956"},
|
||||
"7020": {"name": "留尼汪", "parent": "3956"},
|
||||
"7021": {"name": "罗马尼亚", "parent": "3956"},
|
||||
"7022": {"name": "俄罗斯联邦", "parent": "3956"},
|
||||
"7023": {"name": "卢旺达", "parent": "3956"},
|
||||
"7024": {"name": "圣赫勒拿", "parent": "3956"},
|
||||
"7025": {"name": "圣基茨和尼维斯", "parent": "3956"},
|
||||
"7026": {"name": "圣卢西亚", "parent": "3956"},
|
||||
"7027": {"name": "圣皮埃尔和密克隆", "parent": "3956"},
|
||||
"7028": {"name": "圣文森特和格林纳丁斯", "parent": "3956"},
|
||||
"7029": {"name": "萨摩亚", "parent": "3956"},
|
||||
"7030": {"name": "圣马力诺", "parent": "3956"},
|
||||
"7031": {"name": "圣多美和普林西比", "parent": "3956"},
|
||||
"7032": {"name": "沙特阿拉伯", "parent": "3956"},
|
||||
"7033": {"name": "塞内加尔", "parent": "3956"},
|
||||
"7034": {"name": "塞舌尔", "parent": "3956"},
|
||||
"7035": {"name": "塞拉利昂", "parent": "3956"},
|
||||
"7036": {"name": "新加坡", "parent": "3956"},
|
||||
"7037": {"name": "斯洛伐克", "parent": "3956"},
|
||||
"7038": {"name": "斯洛文尼亚", "parent": "3956"},
|
||||
"7039": {"name": "所罗门群岛", "parent": "3956"},
|
||||
"7040": {"name": "索马里", "parent": "3956"},
|
||||
"7041": {"name": "南非", "parent": "3956"},
|
||||
"7042": {"name": "南乔治亚岛和南桑德韦奇岛", "parent": "3956"},
|
||||
"7043": {"name": "斯里兰卡", "parent": "3956"},
|
||||
"7044": {"name": "苏丹", "parent": "3956"},
|
||||
"7045": {"name": "苏里南", "parent": "3956"},
|
||||
"7046": {"name": "斯瓦尔巴群岛", "parent": "3956"},
|
||||
"7047": {"name": "斯威士兰", "parent": "3956"},
|
||||
"7048": {"name": "瑞典", "parent": "3956"},
|
||||
"7049": {"name": "瑞士", "parent": "3956"},
|
||||
"7050": {"name": "叙利亚", "parent": "3956"},
|
||||
"7051": {"name": "塔吉克斯坦", "parent": "3956"},
|
||||
"7052": {"name": "坦桑尼亚", "parent": "3956"},
|
||||
"7053": {"name": "泰国", "parent": "3956"},
|
||||
"7054": {"name": "多哥", "parent": "3956"},
|
||||
"7055": {"name": "托克劳", "parent": "3956"},
|
||||
"7056": {"name": "汤加", "parent": "3956"},
|
||||
"7057": {"name": "特立尼达和多巴哥", "parent": "3956"},
|
||||
"7058": {"name": "突尼斯", "parent": "3956"},
|
||||
"7059": {"name": "土耳其", "parent": "3956"},
|
||||
"7060": {"name": "土库曼斯坦", "parent": "3956"},
|
||||
"7061": {"name": "特克斯科斯群岛", "parent": "3956"},
|
||||
"7062": {"name": "图瓦卢", "parent": "3956"},
|
||||
"7063": {"name": "乌干达", "parent": "3956"},
|
||||
"7064": {"name": "乌克兰", "parent": "3956"},
|
||||
"7065": {"name": "阿联酋", "parent": "3956"},
|
||||
"7066": {"name": "美国本土外小岛屿", "parent": "3956"},
|
||||
"7067": {"name": "乌拉圭", "parent": "3956"},
|
||||
"7068": {"name": "乌兹别克斯坦", "parent": "3956"},
|
||||
"7069": {"name": "瓦努阿图", "parent": "3956"},
|
||||
"7070": {"name": "梵蒂冈", "parent": "3956"},
|
||||
"7071": {"name": "委内瑞拉", "parent": "3956"},
|
||||
"7072": {"name": "越南", "parent": "3956"},
|
||||
"7073": {"name": "英属维尔京群岛", "parent": "3956"},
|
||||
"7074": {"name": "美属维尔京群岛", "parent": "3956"},
|
||||
"7075": {"name": "瓦利斯和富图纳", "parent": "3956"},
|
||||
"7076": {"name": "西撒哈拉", "parent": "3956"},
|
||||
"7077": {"name": "也门", "parent": "3956"},
|
||||
"7078": {"name": "南斯拉夫", "parent": "3956"},
|
||||
"7079": {"name": "赞比亚", "parent": "3956"},
|
||||
"7080": {"name": "津巴布韦", "parent": "3956"},
|
||||
"7081": {"name": "塞尔维亚", "parent": "3956"},
|
||||
"7082": {"name": "雄安新区", "parent": "4"},
|
||||
"7084": {"name": "天门市", "parent": "18"},
|
||||
}
|
||||
|
||||
# All region names in TBL, for O(1) membership tests in isName().
# Set comprehension over .values(): avoids building a throwaway list and
# iterating .items() when the keys are unused (ruff C403 / PERF102).
NM_SET = {v["name"] for v in TBL.values()}
|
||||
|
||||
|
||||
def get_names(id):
    """Resolve a region id to its chain of names: [name, parent, grandparent, ...].

    Empty ids and the literal string "none" yield an empty list.  A
    non-numeric id is treated as free text and returned as-is in a
    single-element list.  Unknown numeric ids yield an empty list.
    """
    if not id or str(id).lower() == "none":
        return []
    id = str(id)
    # Free-text region names (anything non-numeric) pass through untouched.
    if not re.match("[0-9]+$", id.strip()):
        return [id]
    entry = TBL.get(id)
    if not entry:
        return []
    # Walk up the hierarchy recursively via the parent id.
    names = [entry["name"]]
    names.extend(get_names(entry["parent"]))
    return names
|
||||
|
||||
|
||||
|
||||
def isName(nm):
    """Return True if *nm* looks like a known region name.

    Matches the raw name, the name with a "市" (city) suffix appended,
    or the name with a trailing province/autonomous-region suffix stripped.
    """
    if nm in NM_SET or nm + "市" in NM_SET:
        return True
    return re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET
|
||||
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
@@ -0,0 +1,65 @@
|
||||
[
|
||||
"科技",
|
||||
"集团",
|
||||
"网络科技",
|
||||
"技术",
|
||||
"信息",
|
||||
"分公司",
|
||||
"信息技术",
|
||||
"发展",
|
||||
"科技股份",
|
||||
"网络",
|
||||
"贸易",
|
||||
"商贸",
|
||||
"工程",
|
||||
"企业",
|
||||
"集团股份",
|
||||
"商务",
|
||||
"工业",
|
||||
"控股集团",
|
||||
"国际贸易",
|
||||
"软件技术",
|
||||
"数码科技",
|
||||
"软件开发",
|
||||
"有限",
|
||||
"经营",
|
||||
"科技开发",
|
||||
"股份公司",
|
||||
"电子技术",
|
||||
"实业集团",
|
||||
"责任",
|
||||
"无限",
|
||||
"工程技术",
|
||||
"上市公司",
|
||||
"技术开发",
|
||||
"软件系统",
|
||||
"总公司",
|
||||
"网络服务",
|
||||
"ltd.",
|
||||
"technology",
|
||||
"company",
|
||||
"服务公司",
|
||||
"计算机技术",
|
||||
"计算机软件",
|
||||
"电子信息",
|
||||
"corporation",
|
||||
"计算机服务",
|
||||
"计算机系统",
|
||||
"有限公司",
|
||||
"事业部",
|
||||
"公司",
|
||||
"股份",
|
||||
"有限责任",
|
||||
"软件",
|
||||
"控股",
|
||||
"高科技",
|
||||
"房地产",
|
||||
"事业群",
|
||||
"部门",
|
||||
"电子商务",
|
||||
"人力资源顾问",
|
||||
"人力资源",
|
||||
"株式会社",
|
||||
"网络营销"
|
||||
]
|
||||
|
||||
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
File diff suppressed because it is too large
Load Diff
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
File diff suppressed because it is too large
Load Diff
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
@@ -0,0 +1,911 @@
|
||||
[
|
||||
"google assistant investments",
|
||||
"amazon",
|
||||
"dingtalk china information",
|
||||
"zhejiang alibaba communication",
|
||||
"yunos",
|
||||
"腾讯云",
|
||||
"新浪新闻",
|
||||
"网邻通",
|
||||
"蚂蚁集团",
|
||||
"大疆",
|
||||
"恒生股份",
|
||||
"sf express",
|
||||
"智者天下",
|
||||
"shanghai hema network",
|
||||
"papayamobile",
|
||||
"lexinfintech",
|
||||
"industrial consumer finance",
|
||||
"360搜索",
|
||||
"世纪光速",
|
||||
"迅雷区块链",
|
||||
"赛盒科技",
|
||||
"齐力电子商务",
|
||||
"平安养老险",
|
||||
"平安证券",
|
||||
"平安好贷",
|
||||
"五八新服",
|
||||
"呯嘭智能",
|
||||
"阿里妈妈",
|
||||
"mdt",
|
||||
"tencent",
|
||||
"weibo",
|
||||
"浪潮软件",
|
||||
"阿里巴巴广告",
|
||||
"mashang consumer finance",
|
||||
"维沃",
|
||||
"hqg , limited",
|
||||
"moodys",
|
||||
"搜狐支付",
|
||||
"百度秀",
|
||||
"新浪服务",
|
||||
"零售通",
|
||||
"同城艺龙",
|
||||
"虾米音乐",
|
||||
"贝壳集团",
|
||||
"小米有品",
|
||||
"滴滴自动驾驶",
|
||||
"图记",
|
||||
"阿里影业",
|
||||
"卓联软件",
|
||||
"zhejiang tmall",
|
||||
"谷歌中国",
|
||||
"hithink flush",
|
||||
"时装科技",
|
||||
"程会玩国际旅行社",
|
||||
"amazon china holding limited",
|
||||
"中信消金",
|
||||
"当当比特物流",
|
||||
"新浪新媒体咨询",
|
||||
"tongcheng network",
|
||||
"金山在线",
|
||||
"shopping cart",
|
||||
"犀互动",
|
||||
"五八",
|
||||
"bilibili",
|
||||
"阿里星球",
|
||||
"滴滴金科服务",
|
||||
"美团",
|
||||
"哈啰出行",
|
||||
"face",
|
||||
"平安健康",
|
||||
"招商银行",
|
||||
"连亚",
|
||||
"盒马网络",
|
||||
"b站",
|
||||
"华为机器",
|
||||
"shanghai mdt infotech",
|
||||
"ping an healthkonnect",
|
||||
"beijing home link real estate broker",
|
||||
"花海仓",
|
||||
"beijing jingdong shangke information",
|
||||
"微影智能",
|
||||
"酷狗游戏",
|
||||
"health.pingan.com",
|
||||
"众安",
|
||||
"陌陌",
|
||||
"海康威视数字",
|
||||
"同程网",
|
||||
"艾丁金融",
|
||||
"知乎",
|
||||
" lu",
|
||||
"国际商业机器公司",
|
||||
"捷信消费金融",
|
||||
"恒生利融",
|
||||
"china merchants bank",
|
||||
"企鹅电竞",
|
||||
"捷信信驰",
|
||||
"360智能家居",
|
||||
"小桔车服",
|
||||
"homecredit",
|
||||
"皮皮虾",
|
||||
"畅游",
|
||||
"聚爱聊",
|
||||
"suning.com",
|
||||
"途牛旅游网",
|
||||
"花呗",
|
||||
"盈店通",
|
||||
"sina",
|
||||
"阿里巴巴音乐",
|
||||
"华为技术有限公司",
|
||||
"国付宝",
|
||||
"shanghai lianshang network",
|
||||
"oppo",
|
||||
"华为投资控股",
|
||||
"beijing sohu new media information",
|
||||
"times square",
|
||||
"菜鸟物流",
|
||||
"lingxing",
|
||||
"jd digits",
|
||||
"同程旅游",
|
||||
"分期乐",
|
||||
"火锅视频",
|
||||
"天天快报",
|
||||
"猎豹移动",
|
||||
"五八人力资源",
|
||||
"宝宝树",
|
||||
"顺丰科技",
|
||||
"上海西翠",
|
||||
"诗程文化传播",
|
||||
"dewu",
|
||||
"领星网络",
|
||||
"aliexpress",
|
||||
"贝塔通科技",
|
||||
"链家",
|
||||
"花小猪",
|
||||
"趣输入",
|
||||
"搜狐新媒体",
|
||||
"一淘",
|
||||
"56",
|
||||
"qq阅读",
|
||||
"青桔单车",
|
||||
"iflytek",
|
||||
"每日优鲜电子商务",
|
||||
"腾讯觅影",
|
||||
"微医",
|
||||
"松果网",
|
||||
"paypal",
|
||||
"递瑞供应链管理",
|
||||
"领星",
|
||||
"qunar",
|
||||
"三快",
|
||||
"lu.com",
|
||||
"携程旅行网",
|
||||
"新潮传媒",
|
||||
"链家经纪",
|
||||
"景域文化",
|
||||
"阿里健康",
|
||||
"pingpeng",
|
||||
"聚划算",
|
||||
"零机科技",
|
||||
"街兔电单车",
|
||||
"快乐购",
|
||||
"华为数字能源",
|
||||
"搜狐",
|
||||
"陆家嘴国际金融资产交易市场",
|
||||
"nanjing tuniu",
|
||||
"亚马逊",
|
||||
"苏宁易购",
|
||||
"携程旅游",
|
||||
"苏宁金服",
|
||||
"babytree",
|
||||
"悟空问答",
|
||||
"同花顺",
|
||||
"eastmoney",
|
||||
"浪潮信息",
|
||||
"滴滴智慧交通",
|
||||
"beijing ruixun lingtong",
|
||||
"平安综合金融服务",
|
||||
"爱奇艺",
|
||||
"小米集团",
|
||||
"华为云",
|
||||
"微店",
|
||||
"恒生集团",
|
||||
"网易有道",
|
||||
"boccfc",
|
||||
"世纪思速科技",
|
||||
"海康消防",
|
||||
"beijing xiaomi",
|
||||
"众安科技",
|
||||
"五八同城",
|
||||
"霆程汽车租赁",
|
||||
"云卖分销",
|
||||
"乐信集团",
|
||||
"蚂蚁",
|
||||
"舶乐蜜电子商务",
|
||||
"支付宝中国",
|
||||
"砖块消消消",
|
||||
"vivo",
|
||||
"阿里互娱",
|
||||
"中国平安",
|
||||
"lingxihudong",
|
||||
"百度网盘",
|
||||
"1号店",
|
||||
"字节跳动",
|
||||
"京东科技",
|
||||
"驴妈妈兴旅国际旅行社",
|
||||
"hangzhou alibaba music",
|
||||
"xunlei",
|
||||
"灵犀互动娱乐",
|
||||
"快手",
|
||||
"youtube",
|
||||
"连尚慧眼",
|
||||
"腾讯体育",
|
||||
"爱商在线",
|
||||
"酷我音乐",
|
||||
"金融壹账通",
|
||||
"搜狗服务",
|
||||
"banma information",
|
||||
"a站",
|
||||
"罗汉堂",
|
||||
"薇仕网络",
|
||||
"搜狐新闻",
|
||||
"贝宝",
|
||||
"薇仕",
|
||||
"口袋时尚科技",
|
||||
"穆迪咨询",
|
||||
"新狐投资管理",
|
||||
"hikvision",
|
||||
"alimama china holding limited",
|
||||
"超聚变数字",
|
||||
"腾讯视频",
|
||||
"恒生电子",
|
||||
"百度游戏",
|
||||
"绿洲",
|
||||
"木瓜移动",
|
||||
"红袖添香",
|
||||
"店匠科技",
|
||||
"易贝",
|
||||
"一淘网",
|
||||
"博览群书",
|
||||
"唯品会",
|
||||
"lazglobal",
|
||||
"amap",
|
||||
"芒果网",
|
||||
"口碑",
|
||||
"海康慧影",
|
||||
"腾讯音乐娱乐",
|
||||
"网易严选",
|
||||
"微信",
|
||||
"shenzhen lexin holding",
|
||||
"hangzhou pingpeng intelligent",
|
||||
"连尚网络",
|
||||
"海思",
|
||||
"isunor",
|
||||
"蝉翼",
|
||||
"阿里游戏",
|
||||
"广州优视",
|
||||
"优视",
|
||||
"腾讯征信",
|
||||
"识装",
|
||||
"finserve.pingan.com",
|
||||
"papaya",
|
||||
"阅文",
|
||||
"平安健康保险",
|
||||
"考拉海购",
|
||||
"网易印象",
|
||||
"wifi万能钥匙",
|
||||
"新浪互联服务",
|
||||
"亚马逊云科技",
|
||||
"迅雷看看",
|
||||
"华为朗新科技",
|
||||
"adyen hong kong limited",
|
||||
"谷歌",
|
||||
"得物",
|
||||
"网心",
|
||||
"cainiao network",
|
||||
"沐瞳",
|
||||
"linkedln",
|
||||
"hundsun",
|
||||
"阿里旅行",
|
||||
"珍爱网",
|
||||
"阿里巴巴通信",
|
||||
"金山奇剑",
|
||||
"tongtool",
|
||||
"华为安捷信电气",
|
||||
"快乐时代",
|
||||
"平安寿险",
|
||||
"微博",
|
||||
"微跳蚤",
|
||||
"oppo移动通信",
|
||||
"毒",
|
||||
"alimama",
|
||||
"shoplazza",
|
||||
"shenzhen dianjiang science and",
|
||||
"众鸣世科",
|
||||
"平安金融",
|
||||
"狐友",
|
||||
"维沃移动通信",
|
||||
"tobosoft",
|
||||
"齐力电商",
|
||||
"ali",
|
||||
"诚信通",
|
||||
"行吟",
|
||||
"跳舞的线",
|
||||
"橙心优选",
|
||||
"众安健康",
|
||||
"亚马逊中国投资",
|
||||
"德絮投资管理中心合伙",
|
||||
"招联消费金融",
|
||||
"百度文学",
|
||||
"芝麻信用",
|
||||
"阿里零售通",
|
||||
"时装",
|
||||
"花样直播",
|
||||
"sogou",
|
||||
"uc",
|
||||
"海思半导体",
|
||||
"zhongan online p&c insurance",
|
||||
"新浪数字",
|
||||
"驴妈妈旅游网",
|
||||
"华为数字能源技术",
|
||||
"京东数科",
|
||||
"oracle",
|
||||
"xiaomi",
|
||||
"nyse",
|
||||
"阳光消费金融",
|
||||
"天天动听",
|
||||
"大众点评",
|
||||
"上海瑞家",
|
||||
"trustpass",
|
||||
"hundsun technologies",
|
||||
"美团小贷",
|
||||
"ebay",
|
||||
"通途",
|
||||
"tcl",
|
||||
"鸿蒙",
|
||||
"酷狗计算机",
|
||||
"品诺保险",
|
||||
"capitalg",
|
||||
"康盛创想",
|
||||
"58同城",
|
||||
"闲鱼",
|
||||
"微软",
|
||||
"吉易付科技",
|
||||
"理财通",
|
||||
"ctrip",
|
||||
"yy",
|
||||
"华为数字",
|
||||
"kingsoft",
|
||||
"孙宁金融",
|
||||
"房江湖经纪",
|
||||
"youku",
|
||||
"ant financial services group",
|
||||
"盒马",
|
||||
"sensetime",
|
||||
"伊千网络",
|
||||
"小豹ai翻译棒",
|
||||
"shopify",
|
||||
"前海微众银行",
|
||||
"qd",
|
||||
"gmail",
|
||||
"pingpong",
|
||||
"alibaba group holding limited",
|
||||
"捷信时空电子商务",
|
||||
"orientsec",
|
||||
"乔戈里管理咨询",
|
||||
"ant",
|
||||
"锐讯灵通",
|
||||
"兴业消费金融",
|
||||
"京东叁佰陆拾度电子商务",
|
||||
"新浪",
|
||||
"优酷土豆",
|
||||
"海康机器人",
|
||||
"美团单车",
|
||||
"海康存储",
|
||||
"领英",
|
||||
"阿里全球速卖通",
|
||||
"美菜网",
|
||||
"京邦达",
|
||||
"安居客",
|
||||
"阿里体育",
|
||||
"相互宝",
|
||||
"cloudwalk",
|
||||
"百度智能云",
|
||||
"贝壳",
|
||||
"酷狗",
|
||||
"sunshine consumer finance",
|
||||
"掌宜",
|
||||
"奇酷网",
|
||||
"核新同花顺",
|
||||
"阿里巴巴影业",
|
||||
"节创",
|
||||
"学而思网校",
|
||||
"速途",
|
||||
"途牛",
|
||||
"阿里云计算",
|
||||
"beijing sensetime",
|
||||
"alibaba cloud",
|
||||
"西瓜视频",
|
||||
"美团优选",
|
||||
"orient securities limited",
|
||||
"华为朗新",
|
||||
"店匠",
|
||||
"shanghai weishi network",
|
||||
"友盟",
|
||||
"飞猪旅行",
|
||||
"滴滴出行",
|
||||
"alipay",
|
||||
"mogu",
|
||||
"dangdang",
|
||||
"大麦网",
|
||||
"汉军智能系统",
|
||||
"百度地图",
|
||||
"货车帮",
|
||||
"狐狸金服",
|
||||
"众安在线保险经纪",
|
||||
"华为通信",
|
||||
"新浪支付",
|
||||
"zhihu",
|
||||
"alibaba cloud computing",
|
||||
"沙发视频",
|
||||
"金山软件",
|
||||
"ping an good doctor",
|
||||
"携程",
|
||||
"脉脉",
|
||||
"youku information beijing",
|
||||
"zhongan",
|
||||
"艾丁软件",
|
||||
"乒乓智能",
|
||||
"蘑菇街",
|
||||
"taobao",
|
||||
"华为技术服务",
|
||||
"仕承文化传播",
|
||||
"安捷信",
|
||||
"狐狸互联网小额贷款",
|
||||
"节点迅捷",
|
||||
"中国银行",
|
||||
"搜镇",
|
||||
"众安在线",
|
||||
"dingtalk",
|
||||
"云从科技",
|
||||
"beijing jingbangda trade",
|
||||
"moody s",
|
||||
"滚动的天空",
|
||||
"yl.pingan.com",
|
||||
"奇虎",
|
||||
"alihealth",
|
||||
"芒果tv",
|
||||
"lufax",
|
||||
"美团打车",
|
||||
"小桔",
|
||||
"贝壳找房网",
|
||||
"小米科技",
|
||||
"vips",
|
||||
"kindle",
|
||||
"亚马逊服务",
|
||||
"citic consumer finance",
|
||||
"微众",
|
||||
"搜狗智慧互联网医院",
|
||||
"盒马鲜生",
|
||||
"life.pinan.com",
|
||||
"ph.com.cn",
|
||||
"银联",
|
||||
"cmbchina",
|
||||
"平安金融科技咨询",
|
||||
"微保",
|
||||
"甲骨文中国",
|
||||
"飞书",
|
||||
"koubei shanghai information",
|
||||
"企鹅辅导",
|
||||
"斑马",
|
||||
"平安租赁",
|
||||
"云从",
|
||||
"马上消费",
|
||||
"hangzhou ali baba advertising",
|
||||
"金山",
|
||||
"赛盒",
|
||||
"科大讯飞",
|
||||
"金星创业投资",
|
||||
"平安国际融资租赁",
|
||||
"360你财富",
|
||||
"西山居",
|
||||
"shenzhen qianhai fourth paradigm data",
|
||||
"海思光电子",
|
||||
"猎户星空",
|
||||
"网易公司",
|
||||
"浪潮",
|
||||
"粒粒橙传媒",
|
||||
"招联金融",
|
||||
"100. me",
|
||||
"捷信信驰咨询",
|
||||
"唯品仓",
|
||||
"orient",
|
||||
"趣拿",
|
||||
"摩拜单车",
|
||||
"天猫精灵",
|
||||
"菜鸟",
|
||||
"豹小贩",
|
||||
"去哪儿",
|
||||
"米家",
|
||||
"哈啰单车",
|
||||
"搜狐体育",
|
||||
"shopify payments usa",
|
||||
"高德软件",
|
||||
"讯联智付",
|
||||
"乐信",
|
||||
"唯你搭",
|
||||
"第四范式",
|
||||
"菜鸟网络",
|
||||
"同程",
|
||||
"yy语音",
|
||||
"浪潮云",
|
||||
"东财",
|
||||
"淘宝",
|
||||
"寻梦",
|
||||
"citic securities limited",
|
||||
"青橙之旅",
|
||||
"阿里巴巴",
|
||||
"番茄小说",
|
||||
"上海亿贝",
|
||||
"inspur",
|
||||
"babytree inc",
|
||||
"海康智慧产业股权投资基金合伙合伙",
|
||||
"adyen",
|
||||
"艺龙",
|
||||
"蚂蚁金服",
|
||||
"平安金服",
|
||||
"百度百科",
|
||||
"unionpay",
|
||||
"当当",
|
||||
"阅文集团",
|
||||
"东方财富",
|
||||
"东方证券",
|
||||
"哈罗单车",
|
||||
"优酷",
|
||||
"海康",
|
||||
"alipay china network",
|
||||
"网商银行",
|
||||
"钧正",
|
||||
"property.pingan.com",
|
||||
"豹咖啡",
|
||||
"网易",
|
||||
"我爱cba",
|
||||
"theduapp",
|
||||
"360",
|
||||
"金山数字娱乐",
|
||||
"新浪阅读",
|
||||
"alibabagames",
|
||||
"顺丰",
|
||||
"支点商贸",
|
||||
"同程旅行",
|
||||
"citic securities",
|
||||
"ele.com",
|
||||
"tal",
|
||||
"fresh hema",
|
||||
"运满满",
|
||||
"贝壳网",
|
||||
"酷狗音乐",
|
||||
"鲜城",
|
||||
"360健康",
|
||||
"浪潮世科",
|
||||
"迅雷网络",
|
||||
"哔哩哔哩",
|
||||
"华为电动",
|
||||
"淘友天下",
|
||||
"华多网络",
|
||||
"xunlei networking technologies",
|
||||
"云杉",
|
||||
"当当网电子商务",
|
||||
"津虹网络",
|
||||
"wedoc cloud hangzhou holdings",
|
||||
"alisports shanghai",
|
||||
"旷视金智",
|
||||
"钉钉中国",
|
||||
"微影",
|
||||
"金山快快",
|
||||
"亿贝",
|
||||
"wedoc",
|
||||
"autonavi",
|
||||
"哈啰助力车",
|
||||
"google cloud",
|
||||
"新浪乐居",
|
||||
"京东股票",
|
||||
"搜狗智慧远程医疗中心",
|
||||
"中银消金",
|
||||
"merchants union consumer finance",
|
||||
"王者荣耀",
|
||||
"百度手机",
|
||||
"美团民宿",
|
||||
"kaola",
|
||||
"小屋",
|
||||
"金山网络",
|
||||
"来往",
|
||||
"顺丰速运",
|
||||
"腾讯课堂",
|
||||
"百度在线网络",
|
||||
"美团买菜",
|
||||
"威视汽车",
|
||||
"uc mobile",
|
||||
"来赞达",
|
||||
"平安健康医疗",
|
||||
"豹小秘",
|
||||
"尚网",
|
||||
"哈勃投资",
|
||||
" ping an insurance group of china ,",
|
||||
"小米",
|
||||
"360好药",
|
||||
"qq音乐",
|
||||
"lingxigames",
|
||||
"faceu激萌",
|
||||
"搜狗",
|
||||
"sohu",
|
||||
"满帮",
|
||||
"vipshop",
|
||||
"wishpost",
|
||||
"金山世游",
|
||||
"shanghai yibaimi network",
|
||||
"1688",
|
||||
"海康汽车",
|
||||
"顺丰控股",
|
||||
"华为",
|
||||
"妙镜vr",
|
||||
"paybkj.com",
|
||||
"hellobike",
|
||||
"豹来电",
|
||||
"京东",
|
||||
"驴妈妈",
|
||||
"momo",
|
||||
"平安健康险",
|
||||
"哈勃科技",
|
||||
"美菜",
|
||||
"众安在线财产保险",
|
||||
"海康威视",
|
||||
"east money information",
|
||||
"阿里云",
|
||||
"蝉游记",
|
||||
"余额宝",
|
||||
"屋客",
|
||||
"滴滴",
|
||||
"shopify international limited",
|
||||
"百度",
|
||||
"阿里健康中国",
|
||||
"阿里通信",
|
||||
"微梦创科",
|
||||
"微医云",
|
||||
"轻颜相机",
|
||||
"搜易居",
|
||||
"趣店集团",
|
||||
"美团云",
|
||||
"ant group",
|
||||
"金山云",
|
||||
"beijing express hand",
|
||||
"觅觅",
|
||||
"支付宝",
|
||||
"滴滴承信科技咨询服务",
|
||||
"拼多多",
|
||||
"众安运动",
|
||||
"乞力电商",
|
||||
"youcash",
|
||||
"唯品金融",
|
||||
"陆金所",
|
||||
"本地生活",
|
||||
"sz dji",
|
||||
"海康智能",
|
||||
"魔方网聘",
|
||||
"青藤大学",
|
||||
"international business machines",
|
||||
"学而思",
|
||||
"beijing zhongming century science and",
|
||||
"猎豹清理大师",
|
||||
"asinking",
|
||||
"高德",
|
||||
"苏宁",
|
||||
"优酷网",
|
||||
"艾丁",
|
||||
"中银消费金融",
|
||||
"京东健康",
|
||||
"五八教育",
|
||||
"pingpongx",
|
||||
"搜狐时尚",
|
||||
"阿里广告",
|
||||
"平安财险",
|
||||
"中邮消金",
|
||||
"etao",
|
||||
"怕怕",
|
||||
"nyse:cmcm",
|
||||
"华为培训中心",
|
||||
"高德地图",
|
||||
"云狐天下征信",
|
||||
"大疆创新",
|
||||
"连尚",
|
||||
"壹佰米",
|
||||
"康健公司",
|
||||
"iqiyi.com",
|
||||
"360安全云盘",
|
||||
"馒头直播",
|
||||
"淘友网",
|
||||
"东方赢家",
|
||||
"bank of china",
|
||||
"微众银行",
|
||||
"阿里巴巴国际站",
|
||||
"虾米",
|
||||
"去哪儿网",
|
||||
"ctrip travel network shanghai",
|
||||
"潇湘书院",
|
||||
"腾讯",
|
||||
"快乐阳光互动娱乐传媒",
|
||||
"迅雷",
|
||||
"weidian",
|
||||
"滴滴货运",
|
||||
"ping an puhui enterprise management",
|
||||
"新浪仓石基金销售",
|
||||
"搜狐焦点",
|
||||
"alibaba pictures",
|
||||
"wps",
|
||||
"平安",
|
||||
"lazmall",
|
||||
"百度开放平台",
|
||||
"兴业消金",
|
||||
" 珍爱网",
|
||||
"京东云",
|
||||
"小红书",
|
||||
"1688. com",
|
||||
"如视智数",
|
||||
"missfresh",
|
||||
"pazl.pingan.cn",
|
||||
"平安集团",
|
||||
"kugou",
|
||||
"懂车帝",
|
||||
"斑马智行",
|
||||
"浪潮集团",
|
||||
"netease hangzhou network",
|
||||
"pagd.net",
|
||||
"探探",
|
||||
"chinaliterature",
|
||||
"amazon亚马逊",
|
||||
"alphabet",
|
||||
"当当文创手工艺品电子商务",
|
||||
"五八邦",
|
||||
"shenzhen zhenai network information",
|
||||
"lingshoutong",
|
||||
"字节",
|
||||
"lvmama",
|
||||
"金山办公",
|
||||
"众安保险",
|
||||
"时装信息",
|
||||
"优视科技",
|
||||
"guangzhou kugou",
|
||||
"ibm",
|
||||
"滴滴打车",
|
||||
"beijing sogou information service",
|
||||
"megvii",
|
||||
"健谈哥",
|
||||
"cloudwalk group",
|
||||
"蜂联科技",
|
||||
"冬云",
|
||||
"京东尚科",
|
||||
"钢琴块2",
|
||||
"京东世纪",
|
||||
"商汤",
|
||||
"众鸣世纪",
|
||||
"腾讯音乐",
|
||||
"迅雷网文化",
|
||||
"华为云计算技术",
|
||||
"live.me",
|
||||
"全球速卖通",
|
||||
"快的打车",
|
||||
"hello group inc",
|
||||
"美丽说",
|
||||
"suning",
|
||||
"opengauss",
|
||||
"lazada",
|
||||
"tmall",
|
||||
"acfun",
|
||||
"当当网",
|
||||
"中银",
|
||||
"旷视科技",
|
||||
"百度钱包",
|
||||
"淘宝网",
|
||||
"新浪微博",
|
||||
"迅雷集团",
|
||||
"中信消费金融",
|
||||
"学而思教育",
|
||||
"平安普惠",
|
||||
"悟空跨境",
|
||||
"irobotbox",
|
||||
"平安产险",
|
||||
"inspur group",
|
||||
"世纪卓越快递服务",
|
||||
"奇虎360",
|
||||
"webank",
|
||||
"偶藻",
|
||||
"唯品支付",
|
||||
"腾讯云计算",
|
||||
"众安服务",
|
||||
"亿之唐",
|
||||
"beijing 58 information ttechnology",
|
||||
"平安好医生",
|
||||
"迅雷之锤",
|
||||
"旅行小账本",
|
||||
"芒果游戏",
|
||||
"新浪传媒",
|
||||
"旷镜博煊",
|
||||
"全民k歌",
|
||||
"滴滴支付",
|
||||
"北京网心科技",
|
||||
"挂号网",
|
||||
"萤石",
|
||||
"chinavision media group limited",
|
||||
"猎豹安全大师",
|
||||
"cmcm",
|
||||
"趣店",
|
||||
"蚂蚁财富",
|
||||
"商汤科技",
|
||||
"甲骨文",
|
||||
"百度云",
|
||||
"百度apollo",
|
||||
"19 pay",
|
||||
"stock.pingan.com",
|
||||
"tiktok",
|
||||
"alibaba pictures group limited",
|
||||
"ele",
|
||||
"考拉",
|
||||
"天猫",
|
||||
"腾讯优图",
|
||||
"起点中文网",
|
||||
"百度视频",
|
||||
"shanghai bili bili",
|
||||
"京东物流",
|
||||
"ebay marketplaces gmbh",
|
||||
"alibaba sport",
|
||||
"wish",
|
||||
"阿里巴巴中国",
|
||||
"中国银联",
|
||||
"alibaba china network",
|
||||
"china ping an property insurance",
|
||||
"百度糯米网",
|
||||
"微软中国",
|
||||
"一九付",
|
||||
"4 paradigm",
|
||||
"叮咚买菜",
|
||||
"umeng",
|
||||
"众鸣科技",
|
||||
"平安财富通",
|
||||
"google",
|
||||
"巨量引擎",
|
||||
"百度贴吧",
|
||||
"beijing jingdong century information",
|
||||
"讯飞",
|
||||
"beijing yunshan information",
|
||||
"满运软件",
|
||||
"中邮消费金融",
|
||||
"饿了么",
|
||||
"alios",
|
||||
"腾讯ai实验室",
|
||||
"第四范式智能",
|
||||
"瀚星创业投资",
|
||||
"gradient ventures",
|
||||
"microsoft",
|
||||
"哈啰共享汽车",
|
||||
"乞力电子商务",
|
||||
"mscf",
|
||||
"网易影业文化",
|
||||
"铁友旅游咨询",
|
||||
"kilimall",
|
||||
"云企互联投资",
|
||||
"ping an financial consulting",
|
||||
"beijng jingdong century commerce",
|
||||
"高德威智能交通系统",
|
||||
"中友信息",
|
||||
"平安医疗健康管理",
|
||||
"eciticcfc",
|
||||
"中信证券",
|
||||
"fliggy",
|
||||
"电子湾",
|
||||
"旷云金智",
|
||||
"微粒贷",
|
||||
"rsi",
|
||||
"滴滴云计算",
|
||||
"google ventures",
|
||||
"箐程",
|
||||
"每日优鲜",
|
||||
"音兔",
|
||||
"拉扎斯",
|
||||
"今日头条",
|
||||
"乐信控股",
|
||||
"猎豹浏览器",
|
||||
"细微咨询",
|
||||
"好未来",
|
||||
"我乐",
|
||||
"绘声绘色",
|
||||
"抖音",
|
||||
"搜狐新时代",
|
||||
"飞猪",
|
||||
"鹅厂",
|
||||
"贝壳找房",
|
||||
"tuniu",
|
||||
"红马传媒文化",
|
||||
"钉钉",
|
||||
"马上消费金融",
|
||||
"360手机",
|
||||
"平安医保",
|
||||
"快途",
|
||||
"alibaba",
|
||||
"小哈换电",
|
||||
"大麦",
|
||||
"恒睿人工智能研究院",
|
||||
"谷歌资本",
|
||||
"猎豹",
|
||||
"穆迪信息"
|
||||
]
|
||||
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
@@ -0,0 +1,595 @@
|
||||
[
|
||||
"中国科技大学",
|
||||
"国防科学技术大学",
|
||||
"清华大学",
|
||||
"清华",
|
||||
"tsinghua university",
|
||||
"thu",
|
||||
"北京大学",
|
||||
"北大",
|
||||
"beijing university",
|
||||
"pku",
|
||||
"中国科学技术大学",
|
||||
"中国科大",
|
||||
"中科大",
|
||||
"china science & technology university",
|
||||
"ustc",
|
||||
"复旦大学",
|
||||
"复旦",
|
||||
"fudan university",
|
||||
"fdu",
|
||||
"中国人民大学",
|
||||
"人大",
|
||||
"人民大学",
|
||||
"renmin university of china",
|
||||
"ruc",
|
||||
"上海交通大学",
|
||||
"上海交大",
|
||||
"shanghai jiao tong university",
|
||||
"sjtu",
|
||||
"南京大学",
|
||||
"南大",
|
||||
"nanjing university",
|
||||
"nju",
|
||||
"同济大学",
|
||||
"同济",
|
||||
"tongji university",
|
||||
"tongji",
|
||||
"浙江大学",
|
||||
"浙大",
|
||||
"zhejiang university",
|
||||
"zju",
|
||||
"南开大学",
|
||||
"南开",
|
||||
"nankai university",
|
||||
"nku",
|
||||
"北京航空航天大学",
|
||||
"北航",
|
||||
"beihang university",
|
||||
"buaa",
|
||||
"北京师范大学",
|
||||
"北师",
|
||||
"北师大",
|
||||
"beijing normal university",
|
||||
"bnu",
|
||||
"武汉大学",
|
||||
"武大",
|
||||
"wuhan university",
|
||||
"whu",
|
||||
"西安交通大学",
|
||||
"西安交大",
|
||||
"xi’an jiaotong university",
|
||||
"xjtu",
|
||||
"天津大学",
|
||||
"天大",
|
||||
"university of tianjin",
|
||||
"tju",
|
||||
"华中科技大学",
|
||||
"华中大",
|
||||
"central china university science and technology",
|
||||
"hust",
|
||||
"北京理工大学",
|
||||
"北理",
|
||||
"beijing institute of technology",
|
||||
"bit",
|
||||
"东南大学",
|
||||
"东大",
|
||||
"southeast china university",
|
||||
"seu",
|
||||
"中山大学",
|
||||
"中大",
|
||||
"zhongshan university",
|
||||
"sysu",
|
||||
"华东师范大学",
|
||||
"华师大",
|
||||
"east china normal university",
|
||||
"ecnu",
|
||||
"哈尔滨工业大学",
|
||||
"哈工大",
|
||||
"harbin institute of technology",
|
||||
"hit",
|
||||
"厦门大学",
|
||||
"厦大",
|
||||
"xiamen university",
|
||||
"xmu",
|
||||
"西北工业大学",
|
||||
"西工大",
|
||||
"西北工大",
|
||||
"northwestern polytechnical university",
|
||||
"npu",
|
||||
"中南大学",
|
||||
"中南",
|
||||
"middle and southern university",
|
||||
"csu",
|
||||
"大连理工大学",
|
||||
"大工",
|
||||
"institute of technology of dalian",
|
||||
"dut",
|
||||
"四川大学",
|
||||
"川大",
|
||||
"sichuan university",
|
||||
"scu",
|
||||
"电子科技大学",
|
||||
"电子科大",
|
||||
"university of electronic science and technology of china",
|
||||
"uestc",
|
||||
"华南理工大学",
|
||||
"华南理工",
|
||||
"institutes of technology of south china",
|
||||
"scut",
|
||||
"吉林大学",
|
||||
"吉大",
|
||||
"jilin university",
|
||||
"jlu",
|
||||
"湖南大学",
|
||||
"湖大",
|
||||
"hunan university",
|
||||
"hnu",
|
||||
"重庆大学",
|
||||
"重大",
|
||||
"university of chongqing",
|
||||
"cqu",
|
||||
"山东大学",
|
||||
"山大",
|
||||
"shandong university",
|
||||
"sdu",
|
||||
"中国农业大学",
|
||||
"中国农大",
|
||||
"china agricultural university",
|
||||
"cau",
|
||||
"中国海洋大学",
|
||||
"中国海大",
|
||||
"chinese marine university",
|
||||
"ouc",
|
||||
"中央民族大学",
|
||||
"中央民大",
|
||||
"central university for nationalities",
|
||||
"muc",
|
||||
"东北大学",
|
||||
"东北工学院",
|
||||
"northeastern university",
|
||||
"neu 或 nu",
|
||||
"兰州大学",
|
||||
"兰大",
|
||||
"lanzhou university",
|
||||
"lzu",
|
||||
"西北农林科技大学",
|
||||
"西农","西北农大",
|
||||
"northwest a&f university",
|
||||
"nwafu",
|
||||
"中国人民解放军国防科技大学",
|
||||
"国防科技大学","国防科大",
|
||||
"national university of defense technology",
|
||||
"nudt",
|
||||
"郑州大学",
|
||||
"郑大",
|
||||
"zhengzhou university",
|
||||
"zzu",
|
||||
"云南大学",
|
||||
"云大",
|
||||
"yunnan university",
|
||||
"ynu",
|
||||
"新疆大学",
|
||||
"新大",
|
||||
"xinjiang university",
|
||||
"xju",
|
||||
"北京交通大学",
|
||||
"北京交大",
|
||||
"beijing jiaotong university",
|
||||
"bjtu",
|
||||
"北京工业大学",
|
||||
"北工大",
|
||||
"beijing university of technology",
|
||||
"bjut",
|
||||
"北京科技大学",
|
||||
"北科大","北京科大",
|
||||
"university of science and technology beijing",
|
||||
"ustb",
|
||||
"北京化工大学",
|
||||
"北化",
|
||||
"beijing university of chemical technology",
|
||||
"buct",
|
||||
"北京邮电大学",
|
||||
"北邮",
|
||||
"beijing university of posts and telecommunications",
|
||||
"beijing university of post and telecommunications",
|
||||
"beijing university of post and telecommunication",
|
||||
"beijing university of posts and telecommunication",
|
||||
"bupt",
|
||||
"北京林业大学",
|
||||
"北林",
|
||||
"beijing forestry university",
|
||||
"bfu",
|
||||
"北京协和医学院",
|
||||
"协和医学院",
|
||||
"peking union medical college",
|
||||
"pumc",
|
||||
"北京中医药大学",
|
||||
"北中医",
|
||||
"beijing university of chinese medicine",
|
||||
"bucm",
|
||||
"首都师范大学",
|
||||
"首师大",
|
||||
"capital normal university",
|
||||
"cnu",
|
||||
"北京外国语大学",
|
||||
"北外",
|
||||
"beijing foreign studies university",
|
||||
"bfsu",
|
||||
"中国传媒大学",
|
||||
"中媒",
|
||||
"中传",
|
||||
"北京广播学院",
|
||||
"communication university of china",
|
||||
"cuc",
|
||||
"中央财经大学",
|
||||
"中央财大",
|
||||
"中财大",
|
||||
"the central university of finance and economics",
|
||||
"cufe",
|
||||
"对外经济贸易大学",
|
||||
"对外经贸大学",
|
||||
"贸大",
|
||||
"university of international business and economics",
|
||||
"uibe",
|
||||
"外交学院",
|
||||
"外院",
|
||||
"china foreign affairs university",
|
||||
"cfau",
|
||||
"中国人民公安大学",
|
||||
"公安大学",
|
||||
"people's public security university of china",
|
||||
"ppsuc",
|
||||
"北京体育大学",
|
||||
"北体大",
|
||||
"beijing sport university",
|
||||
"bsu",
|
||||
"中央音乐学院",
|
||||
"央音",
|
||||
"中央院",
|
||||
"central conservatory of music",
|
||||
"ccom",
|
||||
"中国音乐学院",
|
||||
"国音",
|
||||
"中国院",
|
||||
"china conservatory of music",
|
||||
"ccmusic",
|
||||
"中央美术学院",
|
||||
"央美",
|
||||
"central academy of fine art",
|
||||
"cafa",
|
||||
"中央戏剧学院",
|
||||
"中戏",
|
||||
"the central academy of drama",
|
||||
"tcad",
|
||||
"中国政法大学",
|
||||
"法大",
|
||||
"china university of political science and law",
|
||||
"zuc",
|
||||
"cupl",
|
||||
"中国科学院大学",
|
||||
"国科大",
|
||||
"科院大",
|
||||
"university of chinese academy of sciences",
|
||||
"ucas",
|
||||
"福州大学",
|
||||
"福大",
|
||||
"university of fuzhou",
|
||||
"fzu",
|
||||
"暨南大学",
|
||||
"暨大",
|
||||
"ji'nan university",
|
||||
"jnu",
|
||||
"广州中医药大学",
|
||||
"广中医",
|
||||
"traditional chinese medicine university of guangzhou",
|
||||
"gucm",
|
||||
"华南师范大学",
|
||||
"华南师大",
|
||||
"south china normal university",
|
||||
"scnu",
|
||||
"广西大学",
|
||||
"西大",
|
||||
"guangxi university",
|
||||
"gxu",
|
||||
"贵州大学",
|
||||
"贵大",
|
||||
"guizhou university",
|
||||
"gzu",
|
||||
"海南大学",
|
||||
"海大",
|
||||
"university of hainan",
|
||||
"hainu",
|
||||
"河南大学",
|
||||
"河大",
|
||||
"he'nan university",
|
||||
"henu",
|
||||
"哈尔滨工程大学",
|
||||
"哈工程",
|
||||
"harbin engineering university",
|
||||
"heu",
|
||||
"东北农业大学",
|
||||
"东北农大",
|
||||
"northeast agricultural university",
|
||||
"neau",
|
||||
"东北林业大学",
|
||||
"东北林大",
|
||||
"northeast forestry university",
|
||||
"nefu",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"china university of geosciences",
|
||||
"cug",
|
||||
"武汉理工大学",
|
||||
"武汉理工",
|
||||
"wuhan university of technology",
|
||||
"wut",
|
||||
"华中农业大学",
|
||||
"华中农大",
|
||||
"华农",
|
||||
"central china agricultural university",
|
||||
"hzau",
|
||||
"华中师范大学",
|
||||
"华中师大",
|
||||
"华大",
|
||||
"central china normal university",
|
||||
"ccnu",
|
||||
"中南财经政法大学",
|
||||
"中南大",
|
||||
"zhongnan university of economics & law",
|
||||
"zuel",
|
||||
"湖南师范大学",
|
||||
"湖南师大",
|
||||
"hunan normal university",
|
||||
"hunnu",
|
||||
"延边大学",
|
||||
"延大",
|
||||
"yanbian university",
|
||||
"ybu",
|
||||
"东北师范大学",
|
||||
"东北师大",
|
||||
"northeast normal university",
|
||||
"nenu",
|
||||
"苏州大学",
|
||||
"苏大",
|
||||
"soochow university",
|
||||
"suda",
|
||||
"南京航空航天大学",
|
||||
"南航",
|
||||
"nanjing aero-space university",
|
||||
"nuaa",
|
||||
"南京理工大学",
|
||||
"南理工",
|
||||
"institutes of technology of nanjing",
|
||||
"njust",
|
||||
"中国矿业大学",
|
||||
"中国矿大",
|
||||
"china mining university",
|
||||
"cumt",
|
||||
"南京邮电大学",
|
||||
"南邮",
|
||||
"nanjing university of posts and telecommunications",
|
||||
"njupt",
|
||||
"河海大学",
|
||||
"河海",
|
||||
"river sea university",
|
||||
"hhu",
|
||||
"江南大学",
|
||||
"江南大",
|
||||
"jiangnan university",
|
||||
"jiangnan",
|
||||
"南京林业大学",
|
||||
"南林",
|
||||
"nanjing forestry university",
|
||||
"njfu",
|
||||
"南京信息工程大学",
|
||||
"南信大",
|
||||
"nanjing university of information science and technology",
|
||||
"nuist",
|
||||
"南京农业大学",
|
||||
"南农",
|
||||
"南农大",
|
||||
"南京农大",
|
||||
"agricultural university of nanjing",
|
||||
"njau",
|
||||
"nau",
|
||||
"南京中医药大学",
|
||||
"南中医",
|
||||
"nanjing university of chinese medicine",
|
||||
"njucm",
|
||||
"中国药科大学",
|
||||
"中国药大",
|
||||
"china medicine university",
|
||||
"cpu",
|
||||
"南京师范大学",
|
||||
"南京师大",
|
||||
"南师大",
|
||||
"南师",
|
||||
"nanjing normal university",
|
||||
"nnu",
|
||||
"南昌大学",
|
||||
"昌大",
|
||||
"university of nanchang","nanchang university",
|
||||
"ncu",
|
||||
"辽宁大学",
|
||||
"辽大",
|
||||
"liaoning university",
|
||||
"lnu",
|
||||
"大连海事大学",
|
||||
"大连海大",
|
||||
"海大",
|
||||
"maritime affairs university of dalian",
|
||||
"dmu",
|
||||
"内蒙古大学",
|
||||
"内大",
|
||||
"university of the inner mongol","inner mongolia university",
|
||||
"imu",
|
||||
"宁夏大学",
|
||||
"宁大",
|
||||
"ningxia university",
|
||||
"nxu",
|
||||
"青海大学",
|
||||
"清大",
|
||||
"qinghai university",
|
||||
"qhu",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"china university of petroleum beijing",
|
||||
"upc",
|
||||
"太原理工大学",
|
||||
"太原理工",
|
||||
"institutes of technology of taiyuan","taiyuan university of technology",
|
||||
"tyut",
|
||||
"西北大学",
|
||||
"西大",
|
||||
"northwest university",
|
||||
"nwu",
|
||||
"西安电子科技大学",
|
||||
"西电",
|
||||
"xidian university",
|
||||
"xdu",
|
||||
"长安大学",
|
||||
"长大",
|
||||
"chang`an university",
|
||||
"chu",
|
||||
"陕西师范大学",
|
||||
"陕西师大",
|
||||
"陕师大",
|
||||
"shaanxi normal university",
|
||||
"snnu",
|
||||
"第四军医大学",
|
||||
"空军军医大学","四医大",
|
||||
"air force medical university",
|
||||
"fmmu",
|
||||
"华东理工大学",
|
||||
"华理",
|
||||
"east china university of science",
|
||||
"ecust",
|
||||
"东华大学",
|
||||
"东华",
|
||||
"donghua university",
|
||||
"dhu",
|
||||
"上海海洋大学",
|
||||
"上海海大",
|
||||
"shanghai ocean university",
|
||||
"shou",
|
||||
"上海中医药大学",
|
||||
"上中医",
|
||||
"shanghai university of traditional chinese medicine",
|
||||
"shutcm",
|
||||
"上海外国语大学",
|
||||
"上外",
|
||||
"shanghai international studies university",
|
||||
"sisu",
|
||||
"上海财经大学",
|
||||
"上海财大",
|
||||
"上财",
|
||||
"shanghai university of finance",
|
||||
"sufe",
|
||||
"上海体育学院",
|
||||
"shanghai university of sport",
|
||||
"上海音乐学院",
|
||||
"上音",
|
||||
"shanghai conservatory of music",
|
||||
"shcm",
|
||||
"上海大学",
|
||||
"上大",
|
||||
"shanghai university",
|
||||
"第二军医大学",
|
||||
"海军军医大学",
|
||||
"naval medical university",
|
||||
"西南交通大学",
|
||||
"西南交大",
|
||||
"southwest jiaotong university",
|
||||
"swjtu",
|
||||
"西南石油大学",
|
||||
"西南石大",
|
||||
"southwest petroleum university",
|
||||
"swpu",
|
||||
"成都理工大学",
|
||||
"成都理工",
|
||||
"chengdu university of technology",
|
||||
"cdut ",
|
||||
"四川农业大学",
|
||||
"川农",
|
||||
"川农大",
|
||||
"sichuan agricultural university",
|
||||
"sicau",
|
||||
"成都中医药大学",
|
||||
"成中医",
|
||||
"chengdu university of tcm",
|
||||
"cdutcm",
|
||||
"西南财经大学",
|
||||
"西南财大",
|
||||
"西财",
|
||||
"southwestern university of finance and economics",
|
||||
"swufe",
|
||||
"天津工业大学",
|
||||
"天工大",
|
||||
"tianjin university of technology",
|
||||
"tgu",
|
||||
"天津医科大学",
|
||||
"天津医大",
|
||||
"medical university of tianjin",
|
||||
"tmu",
|
||||
"天津中医药大学",
|
||||
"天中",
|
||||
"tianjin university of traditional chinese medicine",
|
||||
"tutcm",
|
||||
"华北电力大学",
|
||||
"华电",
|
||||
"north china electric power university",
|
||||
"ncepu",
|
||||
"河北工业大学",
|
||||
"河工大",
|
||||
"hebei university of technology",
|
||||
"hebut",
|
||||
"西藏大学",
|
||||
"藏大",
|
||||
"tibet university",
|
||||
"tu",
|
||||
"石河子大学",
|
||||
"石大",
|
||||
"shihezi university",
|
||||
"中国美术学院",
|
||||
"中国美院",
|
||||
"国美",
|
||||
"china academy of art",
|
||||
"caa",
|
||||
"宁波大学",
|
||||
"宁大",
|
||||
"ningbo university",
|
||||
"nbu",
|
||||
"西南大学",
|
||||
"西大",
|
||||
"southwest university",
|
||||
"swu",
|
||||
"安徽大学",
|
||||
"安大",
|
||||
"university of anhui",
|
||||
"ahu",
|
||||
"合肥工业大学",
|
||||
"合肥工大",
|
||||
"合工大",
|
||||
"hefei university of technology",
|
||||
"hfut",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"china university of geosciences",
|
||||
"cug",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"北京地大",
|
||||
"cugb",
|
||||
"中国矿业大学",
|
||||
"中国矿大",
|
||||
"china university of mining & technology",
|
||||
"cumtb",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"石大",
|
||||
"china university of petroleum",
|
||||
"cup",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"cup"]
|
||||
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
File diff suppressed because it is too large
Load Diff
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
File diff suppressed because it is too large
Load Diff
91
deepdoc/parser/resume/entities/schools.py
Normal file
91
deepdoc/parser/resume/entities/schools.py
Normal file
@@ -0,0 +1,91 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import copy
|
||||
import pandas as pd
|
||||
|
||||
# Directory containing this module; resource CSV/JSON files live under res/.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# Master school table (tab-separated). Missing cells become "" so string ops are safe.
TBL = pd.read_csv(
    os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
).fillna("")
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())

# Set of "good school" names, normalized by stripping punctuation/whitespace.
# FIX: use `with` so the JSON file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle until GC).
with open(os.path.join(current_file_path, "res/good_sch.json"), "r", encoding="utf-8") as _f:
    GOOD_SCH = json.load(_f)
GOOD_SCH = {re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH}
|
||||
|
||||
|
||||
def loadRank(fnm):
    """Load school rankings from a CSV file into TBL's "rank" column.

    Each line of *fnm* is "name,rank". Rows whose name matches either the
    Chinese or English school name get that rank; everything else keeps the
    sentinel 1000000 (meaning "unranked"). Malformed lines are skipped.
    """
    global TBL
    TBL["rank"] = 1000000  # sentinel: lower is better, so unranked sorts last
    with open(fnm, "r", encoding="utf-8") as f:
        # FIX: idiomatic line iteration instead of a manual readline()/break loop.
        for raw in f:
            parts = raw.strip("\n").split(",")
            try:
                nm, rk = parts[0].strip(), int(parts[1])
            except (IndexError, ValueError):
                # Narrowed from a blanket `except Exception`: only tolerate
                # short or non-numeric lines, not unexpected table errors.
                continue
            TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk


# Populate ranks at import time from the bundled resource file.
loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
|
||||
|
||||
|
||||
def split(txt):
    """Tokenize *txt* on whitespace, merging runs of Latin-script words.

    Consecutive tokens are joined with a space when the previous token ends
    with an ASCII letter and the next one starts with one, so multi-word
    English school names stay together while CJK tokens remain separate.
    """
    tks = []
    for t in re.sub(r"[ \t]+", " ", txt).split():
        # FIX: the original condition tested `tks` twice ("... and tks");
        # the redundant trailing check is removed.
        if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and re.match(r"[a-zA-Z]", t):
            tks[-1] = tks[-1] + " " + t
        else:
            tks.append(t)
    return tks
|
||||
|
||||
|
||||
def select(nm):
    """Look up one school record by (normalized) name.

    Accepts a string or a list (first element used). The name is lower-cased,
    stripped of parenthesized parts, leading articles/country prefixes and
    punctuation, and "大学...学院" is collapsed to "大学". Returns the first
    matching row of TBL as a dict (matching name_cn, name_en, or a "+"-joined
    alias), or None when there is no match or *nm* is falsy.
    """
    global TBL
    if not nm:
        return
    if isinstance(nm, list):
        nm = str(nm[0])
    nm = split(nm)[0]
    nm = str(nm).lower().strip()
    nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
    nm = re.sub(r"大学.*学院", "大学", nm)
    # FIX: TBL.copy() (deep by default for DataFrames) instead of
    # copy.deepcopy(TBL) — same data-copy semantics, far less overhead.
    tbl = TBL.copy()
    tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
    res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
    if res.empty:
        return

    return json.loads(res.to_json(orient="records"))[0]
|
||||
|
||||
|
||||
def is_good(nm):
    """Return True when the normalized school name *nm* is in GOOD_SCH."""
    cleaned = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    cleaned = re.sub(r"[''`‘’“”,. &()();;]+", "", cleaned)
    return cleaned in GOOD_SCH
|
||||
189
deepdoc/parser/resume/step_one.py
Normal file
189
deepdoc/parser/resume/step_one.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json
|
||||
from deepdoc.parser.resume.entities import degrees, regions, industries
|
||||
|
||||
# Output schema of refactor(): each entry is "<column_name> <type>".
# The column name (first token) becomes a key in the returned dict.
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT",
]
|
||||
|
||||
def refactor(df):
    """Flatten one raw resume row into the flat dict described by FIELDS.

    Expects a DataFrame with a "resume_content" column holding a JSON string.
    Only the FIRST row is returned (see the final `df.values.tolist()[0]`).
    NOTE(review): mutates *df* in place while building the result.
    """
    def deal_obj(obj, k, kk):
        # Safe two-level dict lookup: obj[k][kk], returning "" on any shape mismatch.
        if not isinstance(obj, type({})):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, type({})):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Tolerant JSON parse; malformed content becomes an empty dict.
        try:
            return json.loads(line)
        except Exception:
            pass
        return {}

    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
    df.fillna("", inplace=True)

    # Columns accumulated for the final output; starts with the two always-present ones.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Pull fields `nms` out of the parsed JSON: either from sub-object `cc`
        # (e.g. "contact", "basic"), or — when cc is None — as JSON-serialized
        # top-level values (lists like education/work stay JSON strings).
        nonlocal clms
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(
                    lambda x: json.dumps(x.get(c, {}), ensure_ascii=False)
                    if isinstance(x, type({})) and (isinstance(x.get(c), type({})) or not x.get(c))
                    else str(x).replace("None", ""))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Translate coded values into human-readable names.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join([" ".join(industries.get_names(i)) for i in
                            str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Normalize a value (possibly a list) to a single comma-free string.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    # 'M'/'F' codes → Chinese labels; anything else → "".
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    # 'Y'/'N' flags → 是/否.
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Fall back to the landline ("tel") when the mobile ("phone") is blank.
    for i in range(len(df)):
        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
            df.loc[i, "phone"] = df.loc[i, "tel"].strip()

    # Intermediate-only columns are dropped from the output column list.
    for n in ["industry_ids", "management_experience", "marital", "tel"]:
        for i in range(len(clms)):
            if clms[i] == n:
                del clms[i]
                break

    clms = list(set(clms))

    df = df.reindex(sorted(clms), axis=1)
    # Escape tab/newline characters so values survive tab-separated transport.
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace("\t", " ").replace("\n", "\\n").replace("\r", "\\n"))
    # Zip sorted column values against the FIELDS schema names.
    # NOTE(review): relies on sorted(clms) aligning 1:1 with FIELDS order — verify.
    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
|
||||
696
deepdoc/parser/resume/step_two.py
Normal file
696
deepdoc/parser/resume/step_two.py
Normal file
@@ -0,0 +1,696 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import copy
|
||||
import time
|
||||
import datetime
|
||||
import demjson3
|
||||
import traceback
|
||||
import signal
|
||||
import numpy as np
|
||||
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||
from rag.nlp import rag_tokenizer, surname
|
||||
from xpinyin import Pinyin
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
class TimeoutException(Exception):
    """Raised by time_limit() when a guarded block exceeds its time budget."""
|
||||
|
||||
|
||||
@contextmanager
def time_limit(seconds):
    """Context manager raising TimeoutException after *seconds*.

    Uses SIGALRM, so it only works on Unix and only in the main thread;
    the alarm is always cancelled on exit.
    """
    def _on_alarm(signum, frame):
        raise TimeoutException("Timed out!")

    signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
|
||||
|
||||
|
||||
# Unused placeholder in this module — presumably an environment handle; kept for compatibility.
ENV = None
# Shared pinyin converter used for name romanization in parse().
PY = Pinyin()
|
||||
|
||||
|
||||
def rmHtmlTag(line):
    """Replace simple HTML-like tags in *line* with single spaces (case-insensitive)."""
    tag_pattern = r"<[a-z0-9.\"=';,:\+_/ -]+>"
    return re.sub(tag_pattern, " ", line, count=100000, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def highest_degree(dg):
    """Return the highest-ranked degree name in *dg* (str or list); "" when empty.

    Unknown degree names rank below all known ones; ties keep the first occurrence.
    """
    if not dg:
        return ""
    if isinstance(dg, str):
        dg = [dg]
    rank = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    return max(dg, key=lambda name: rank.get(name, -1))
|
||||
|
||||
|
||||
def forEdu(cv):
    """Derive education-related features on the resume dict *cv* and return it.

    Reads cv["education_obj"] (list of dicts); fills keyword/token/ranking
    fields (school names, majors, degrees, school-rank tier, "好学校"/"好学历"
    tags, edu start/end years) and refines cv["work_exp_flt"]. When no
    education info exists, only penalizes cv["integerity_flt"].
    NOTE(review): mutates entries of education_obj (may set n["degree"]).
    """
    if not cv.get("education_obj"):
        cv["integerity_flt"] *= 0.8
        return cv

    first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
    edu_nst = []
    edu_end_dt = ""
    cv["school_rank_int"] = 1000000  # sentinel: lower is a better-ranked school
    for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
        e = {}
        if n.get("end_time"):
            if n["end_time"] > edu_end_dt:
                edu_end_dt = n["end_time"]
            try:
                dt = n["end_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                ed_dt.append(str(y))
                e["end_dt_kwd"] = str(y)
            except Exception:
                # FIX: was `except Exception as e: pass`, which clobbered the
                # per-entry dict `e` and left the name unbound after the
                # except clause (Python 3), crashing later `e[...]` writes.
                pass
        if n.get("start_time"):
            try:
                dt = n["start_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                st_dt.append(str(y))
                e["start_dt_kwd"] = str(y)
            except Exception:
                pass

        r = schools.select(n.get("school_name", ""))
        if r:
            if str(r.get("type", "")) == "1":
                fea.append("211")
            if str(r.get("type", "")) == "2":
                fea.append("211")
            if str(r.get("is_abroad", "")) == "1":
                fea.append("留学")
            if str(r.get("is_double_first", "")) == "1":
                fea.append("双一流")
            if str(r.get("is_985", "")) == "1":
                fea.append("985")
            if str(r.get("is_world_known", "")) == "1":
                fea.append("海外知名")
            if r.get("rank") and cv["school_rank_int"] > r["rank"]:
                cv["school_rank_int"] = r["rank"]

        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
            e["sch_nm_kwd"] = sch[-1]
            # Last token of the tokenized school name (e.g. "大学"/"学院").
            tokens = rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()
            if tokens:  # guard against an empty tokenization (IndexError on [-1])
                fea.append(tokens[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
            e["major_kwd"] = n["discipline_name"]

        # A 985 school with no degree recorded as the first entry implies 本科 ("1").
        if not n.get("degree") and "985" in fea and not first_fea:
            n["degree"] = "1"

        if n.get("degree"):
            d = degrees.get_name(n["degree"])
            if d:
                e["degree_kwd"] = d
            # A bachelor after a junior-college degree (or adult education) is 专升本.
            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name", ""))):
                d = "专升本"
            if d:
                deg.append(d)

            # Record the first (lowest) degree with its school/major/features.
            if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                fdeg = [d]
                if n.get("school_name"):
                    fsch = [n["school_name"]]
                if n.get("discipline_name"):
                    fmaj = [n["discipline_name"]]
                first_fea = copy.deepcopy(fea)

        edu_nst.append(e)

    # Bucket the best school rank into a coarse tier keyword.
    cv["sch_rank_kwd"] = []
    if cv["school_rank_int"] <= 20 \
            or ("海外名校" in fea and cv["school_rank_int"] <= 200):
        cv["sch_rank_kwd"].append("顶尖学校")
    elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
            or ("海外名校" in fea and cv["school_rank_int"] <= 500 and
                cv["school_rank_int"] > 200):
        cv["sch_rank_kwd"].append("精英学校")
    elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
            or ("海外名校" in fea and cv["school_rank_int"] > 500):
        cv["sch_rank_kwd"].append("优质学校")
    else:
        cv["sch_rank_kwd"].append("一般学校")

    if edu_nst:
        cv["edu_nst"] = edu_nst
    if fea:
        cv["edu_fea_kwd"] = list(set(fea))
    if first_fea:
        cv["edu_first_fea_kwd"] = list(set(first_fea))
    if maj:
        cv["major_kwd"] = maj
    if fsch:
        cv["first_school_name_kwd"] = fsch
    if fdeg:
        cv["first_degree_kwd"] = fdeg
    if fmaj:
        cv["first_major_kwd"] = fmaj
    if st_dt:
        cv["edu_start_kwd"] = st_dt
    if ed_dt:
        cv["edu_end_kwd"] = ed_dt
        cv["edu_end_int"] = max([int(t) for t in ed_dt])
    if deg:
        if "本科" in deg and "专科" in deg:
            deg.append("专升本")
            deg = [d for d in deg if d != '本科']
        cv["degree_kwd"] = deg
        cv["highest_degree_kwd"] = highest_degree(deg)
    if edu_end_dt:
        try:
            if re.match(r"[0-9]{9,}", edu_end_dt):
                edu_end_dt = turnTm2Dt(edu_end_dt)
            if edu_end_dt.strip("\n") == "至今":
                edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
            y, m, d = getYMD(edu_end_dt)
            # Work experience can't exceed years since graduation.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
    if sch:
        cv["school_name_kwd"] = sch
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
                or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
                or not cv.get("degree_kwd"):
            for c in sch:
                if schools.is_good(c):
                    if "tag_kwd" not in cv:
                        cv["tag_kwd"] = []
                    cv["tag_kwd"].append("好学校")
                    cv["tag_kwd"].append("好学历")
                    break
    # NOTE(review): "博士" appears twice in the any() list below (原样保留);
    # the second occurrence was possibly intended as "博士后" — confirm.
    if (len(cv.get("degree_kwd", [])) >= 1 and
            "本科" in cv["degree_kwd"] and
            any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
            or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
            or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        if "好学历" not in cv["tag_kwd"]:
            cv["tag_kwd"].append("好学历")

    if cv.get("major_kwd"):
        cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
    if cv.get("school_name_kwd"):
        cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
    if cv.get("first_school_name_kwd"):
        cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
    if cv.get("first_major_kwd"):
        cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))

    return cv
|
||||
|
||||
|
||||
def forProj(cv):
    """Tokenize project names/descriptions from cv["project_obj"] into *_tks/*_ltks fields."""
    if not cv.get("project_obj"):
        return cv

    names, descriptions = [], []
    ordered = sorted(
        cv.get("project_obj", []),
        key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
        reverse=True,
    )
    for item in ordered:
        if item.get("name"):
            names.append(item["name"])
        # "achivement" is the (misspelled) key actually present in the data.
        for field in ("describe", "responsibilities", "achivement"):
            if item.get(field):
                descriptions.append(str(item[field]))

    if names:
        # Only the most recent project name is tokenized.
        cv["project_name_tks"] = rag_tokenizer.tokenize(names[0])
    if descriptions:
        cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(descriptions)))
        cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(descriptions[0]))

    return cv
|
||||
|
||||
|
||||
def json_loads(line):
    """Leniently parse *line* as JSON via demjson3, first quoting bare Python-style True/False values."""
    return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
|
||||
|
||||
|
||||
def forWork(cv):
    """Derive work-history features on the resume dict *cv* and return it.

    Reads cv["work_obj"] (list of dicts, newest job first after sorting);
    fills position/corporation/industry token fields, company-quality tags,
    job-duration statistics, and refines cv["work_exp_flt"]. When no work
    info exists, only penalizes cv["integerity_flt"].
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []      # per-job durations in days
    scales = []    # numeric company sizes
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False   # current (first) employer is a "good" company
    goodcorp_ = False  # some past employer is a "good" company
    work_st_tm = ""    # earliest start_time seen
    corp_tags = []
    for i, n in enumerate(
            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
                   reverse=True)):
        if isinstance(n, str):
            try:
                n = json_loads(n)
            except Exception:
                continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
            work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                fea[c].append("")  # keep positions aligned across fields
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m:
            continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0:
            continue
        if not y or not m or int(y) > 2022:
            y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m:
            continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r:
                scales.append(int(r.group(1)))

    if goodcorp:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司")
    if goodcorp_:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司(曾)")

    if corp_tags:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm:
        cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]:
        cv["corporation_id"] = fea["corporation_id"]

    if fea["position_name"]:
        cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
        cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
        cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
        cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
        cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

    if fea["subordinates_count"]:
        # FIX: the filter regex was r"[^0-9]+$", which kept only all-NON-digit
        # strings and then crashed on int(); digit-only values are what int()
        # can actually convert, so the class is corrected to [0-9].
        fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
                                     re.match(r"[0-9]+$", str(i))]
    if fea["subordinates_count"]:
        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if isinstance(cv.get("corporation_id"), int):
        cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"):
        cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm):
                work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales:
        cv["scale_flt"] = np.max(scales)
    return cv
|
||||
|
||||
|
||||
def turnTm2Dt(b):
    """Normalize *b* to a date string.

    Falsy input returns None. A string of 10+ digits is treated as a Unix
    epoch (seconds, local time) and formatted as "%Y-%m-%d %H:%M:%S";
    anything else is returned stripped.
    """
    if not b:
        return
    text = str(b).strip()
    if re.match(r"[0-9]{10,}", text):
        text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(text[:10])))
    return text
|
||||
|
||||
|
||||
def getYMD(b):
    """Best-effort extraction of (year, month, day) from a date-ish value.

    Returns a tuple where year is an int (or "" when absent) and month/day
    are strings clamped into valid ranges ("1" on anything implausible).
    Epoch-second inputs are first converted via turnTm2Dt().
    """
    y, m, d = "", "", "01"
    if not b:
        return (y, m, d)
    b = turnTm2Dt(b)
    if re.match(r"[0-9]{4}", b):
        y = int(b[:4])
    month_match = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
    if month_match:
        m = month_match.group(1)
    day_match = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
    if day_match:
        d = day_match.group(1)
    if not d or int(d) == 0 or int(d) > 31:
        d = "1"
    if not m or int(m) > 12 or int(m) < 1:
        m = "1"
    return (y, m, d)
|
||||
|
||||
|
||||
def birth(cv):
    """Fill birth_dt / birthday_kwd / age_int on *cv* from cv["birth"].

    A missing birth field penalizes cv["integerity_flt"]; an unparseable
    one leaves cv unchanged.
    """
    if not cv.get("birth"):
        cv["integerity_flt"] *= 0.9
        return cv
    y, m, d = getYMD(cv["birth"])
    if not m or not y:
        return cv
    cv["birth_dt"] = "%s-%02d-%02d" % (y, int(m), int(d))
    cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))

    cv["age_int"] = datetime.datetime.now().year - int(y)
    return cv
|
||||
|
||||
|
||||
def parse(cv):
    """Transform a raw flat resume dict into an indexable document.

    Normalizes values, computes a completeness score (integerity_flt),
    expands *_obj JSON blobs, tokenizes text fields into *_tks/*_ltks,
    builds *_kwd keyword lists, derives education/project/work/birth
    features, and finally drops every key that does not end in one of the
    recognized suffixes. Returns the cleaned dict (numpy ints converted).
    """
    # "\N" is the SQL NULL escape in exported data — blank it out.
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    # cv = cv.asDict()
    # Field groups driving the per-key processing loop below.
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # (field, label-when-是, label-when-否) triples for yes/no flags.
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Drop None values and empty strings/lists up front.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    # Completeness score: fraction of the grouped fields that carry a value.
    integerity = 0.
    flds_num = 0.

    def hasValues(flds):
        nonlocal integerity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integerity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    cv["integerity_flt"] = integerity / flds_num

    # Collapse free-text company types into a small label set; drop when too short.
    if cv.get("corporation_type"):
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    # Normalize political status to 党员/群众/团员; drop anything unrecognized.
    if cv.get("political_status"):
        for p, r in [
                (r".*党员.*", "党员"),
                (r".*(无党派|公民).*", "群众"),
                (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    # Strip non-digits and a leading +86 country code from the phone number.
    if cv.get("phone"):
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    keys = list(cv.keys())
    for k in keys:
        # deal with json objects: "*_obj" keys hold nested JSON blobs.
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    t = k[:-4]  # strip the "_obj" suffix
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
        if k in small_tks_fld:
            # NOTE(review): other *_sm_tks fields use fine_grained_tokenize;
            # this one re-applies tokenize — confirm intent.
            cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

        # keyword fields: split on punctuation, inserting commas at
        # non-letter/space boundaries first.
        if k in kwd_fld:
            cv[f"{k}_kwd"] = [n.lower()
                              for n in re.split(r"[\t,,;;. ]",
                                                re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                ) if n]

        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        # Cut everything after a dash/paren/plus, then collapse whitespace.
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # Latin name: only keep it when it has at least two words.
            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            # Chinese name starting with a known surname: keep up to 5 chars.
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pingyin and its prefix (for pinyin prefix search).
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split()
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        # Non-Latin names also index their individual characters.
        cv["name_tks"] = (
            rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        cv["integerity_flt"] /= 2.

    # Keep only a valid 11-digit mainland mobile number.
    if cv.get("phone"):
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y:
            y = "2012"  # fallback epoch for records with no usable timestamp
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
    # long text tokenize

    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field: turn 是/否 flags into tag keywords.
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Combine corp tags with school-rank tier and highest degree into one
    # facet keyword ("corp+tier+degree").
    cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    # Derive work experience from work_start_time (epoch ms or date string).
    try:
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Final cleanup: keep only keys with a recognized type suffix.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # Deduplicate keyword lists, dropping noise values and a trailing "市".
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    # Drop non-positive *_fea values.
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]
    logging.debug("CCCCCCCCCCCCCCC")

    return dealWithInt64(cv)
|
||||
|
||||
|
||||
def dealWithInt64(d):
    """Recursively convert numpy integer scalars into plain Python ints.

    Walks dicts (mutated in place) and lists (rebuilt); any ``np.integer``
    leaf becomes a builtin ``int`` so the structure can be serialized.
    All other values are returned unchanged.
    """
    if isinstance(d, dict):
        for key in d:
            d[key] = dealWithInt64(d[key])
        return d
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, np.integer):
        return int(d)
    return d
|
||||
64
deepdoc/parser/txt_parser.py
Normal file
64
deepdoc/parser/txt_parser.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.nlp import num_tokens_from_string
|
||||
|
||||
|
||||
class RAGFlowTxtParser:
    """Parse a plain-text document into token-bounded chunks."""

    def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
        """Read the file at *fnm* (or the raw *binary* bytes) and chunk it.

        Returns a list of ``[chunk_text, ""]`` pairs (see ``parser_txt``).
        """
        txt = get_text(fnm, binary)
        return self.parser_txt(txt, chunk_token_num, delimiter)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
        """Split *txt* on *delimiter* characters into ~*chunk_token_num*-token chunks.

        *delimiter* is a string of single-character separators; a run wrapped
        in backticks (e.g. "`\\n\\n`") is treated as one multi-character
        separator.  Literal escape sequences such as the two characters
        "\\n" are interpreted as the character they denote.

        Returns a list of ``[chunk_text, ""]`` pairs; delimiter occurrences
        themselves are dropped from the output.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be str!")
        cks = [""]
        tk_nums = [0]
        # Interpret literal escape sequences (e.g. backslash-n) without
        # corrupting non-ASCII characters in the delimiter string.
        delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')

        def add_chunk(t):
            # Open a new chunk once the current one is already over budget;
            # otherwise keep appending to it.  (The check happens before the
            # append, so a chunk may exceed chunk_token_num by one section —
            # preserved from the original design.)
            nonlocal cks, tk_nums
            tnum = num_tokens_from_string(t)
            if tk_nums[-1] > chunk_token_num:
                cks.append(t)
                tk_nums.append(tnum)
            else:
                cks[-1] += t
                tk_nums[-1] += tnum

        # Collect separators: backtick-quoted spans are multi-character
        # separators; everything outside backticks splits into single chars.
        dels = []
        s = 0
        for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
            f, t = m.span()
            dels.append(m.group(1))
            dels.extend(list(delimiter[s: f]))
            s = t
        if s < len(delimiter):
            dels.extend(list(delimiter[s:]))
        dels = [re.escape(d) for d in dels if d]
        dels = [d for d in dels if d]
        if not dels:
            # No usable separator: an empty alternation would make re.split
            # cut between every character, so keep the text as one section.
            add_chunk(txt)
            return [[c, ""] for c in cks]
        dels = "|".join(dels)
        secs = re.split(r"(%s)" % dels, txt)
        for sec in secs:
            # The alternation must be parenthesized: in "^a|b$" the anchors
            # bind only to the first and last branch, so an unanchored middle
            # branch could (in principle) match a non-delimiter section.
            if re.match(f"^({dels})$", sec):
                continue
            add_chunk(sec)

        return [[c, ""] for c in cks]
|
||||
32
deepdoc/parser/utils.py
Normal file
32
deepdoc/parser/utils.py
Normal file
@@ -0,0 +1,32 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
def get_text(fnm: str, binary=None) -> str:
    """Return the text content of a document.

    If *binary* (raw bytes) is truthy, detect its encoding with
    ``find_codec`` and decode it, ignoring undecodable bytes.  Otherwise
    read the whole file at *fnm* (platform-default encoding, unchanged
    from the original behavior).
    """
    if binary:
        encoding = find_codec(binary)
        return binary.decode(encoding, errors="ignore")
    # Single read() replaces the original readline()/+= accumulation loop,
    # which was verbose and quadratic in the worst case.
    with open(fnm, "r") as f:
        return f.read()
|
||||
Reference in New Issue
Block a user