将flask改成fastapi
This commit is contained in:
40
deepdoc/parser/__init__.py
Normal file
40
deepdoc/parser/__init__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
||||
from .json_parser import RAGFlowJsonParser as JsonParser
|
||||
from .markdown_parser import MarkdownElementExtractor
|
||||
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
|
||||
from .pdf_parser import PlainParser
|
||||
from .pdf_parser import RAGFlowPdfParser as PdfParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
from .txt_parser import RAGFlowTxtParser as TxtParser
|
||||
|
||||
# Public API of the deepdoc.parser package; names re-exported from the
# per-format parser modules imported above.
__all__ = [
    "PdfParser",
    "PlainParser",
    "DocxParser",
    "ExcelParser",
    "PptParser",
    "HtmlParser",
    "JsonParser",
    "MarkdownParser",
    "TxtParser",
    "MarkdownElementExtractor",
]
||||
139
deepdoc/parser/docx_parser.py
Normal file
139
deepdoc/parser/docx_parser.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from docx import Document
|
||||
import re
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from rag.nlp import rag_tokenizer
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class RAGFlowDocxParser:
    """Parse a .docx document into plain-text sections and flattened table rows.

    Calling an instance returns ``(secs, tbls)`` where ``secs`` is a list of
    ``(paragraph_text, style_name)`` tuples and ``tbls`` is a list of
    per-table string lists produced by ``__compose_table_content``.
    """

    def __extract_table_content(self, tb):
        """Collect every cell's text from a python-docx table and flatten it."""
        df = []
        for row in tb.rows:
            df.append([c.text for c in row.cells])
        return self.__compose_table_content(pd.DataFrame(df))

    def __compose_table_content(self, df):
        """Turn a table DataFrame into "header: value" strings, one per data row."""

        def blockType(b):
            # Classify a cell string into a coarse content type: dates ("Dt"),
            # numbers ("Nu"), codes ("Ca"), English text ("En"), mixed
            # number+text ("NE"), single char ("Sg"), short/long text
            # ("Tx"/"Lx"), person name ("Nr"), or other ("Ot").
            pattern = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
                ("^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            for p, n in pattern:
                if re.search(p, b):
                    return n
            # Fall back to tokenization: longer token runs are treated as text.
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            # "nr" is the tokenizer's part-of-speech tag for person names.
            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        # A table with fewer than two rows has no data rows to compose.
        if len(df) < 2:
            return []
        # Dominant cell type over all non-header rows.
        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
            1, len(df)) for j in range(len(df.iloc[i, :]))])
        max_type = max(max_type.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # header is not necessarily appear in the first line
        if max_type == "Nu":
            # For numeric tables, any row whose dominant type differs is
            # treated as an additional (possibly multi-line) header row.
            for r in range(1, len(df)):
                tys = Counter([blockType(str(df.iloc[r, j]))
                               for j in range(len(df.iloc[r, :]))])
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            # Offsets (negative) of the header rows that precede row i; keep
            # only the contiguous run of headers nearest to this row.
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1
            headers = []
            for j in range(len(df.iloc[i, :])):
                # Join the (deduplicated) header cells above column j.
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)
            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))

        # Wide tables are returned row-per-string; narrow ones as one blob.
        if colnm > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse *fnm* (path or raw bytes) and return (sections, tables).

        Page numbers are approximated by counting rendered page breaks in the
        run XML; paragraphs outside [from_page, to_page) contribute no text.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # parsed page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            runs_within_single_paragraph = []  # save runs within the range of pages
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    runs_within_single_paragraph.append(run.text)  # append run.text first

                # wrap page break checker into a static method
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else ''))  # then concat run.text as part of the paragraph

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
||||
189
deepdoc/parser/excel_parser.py
Normal file
189
deepdoc/parser/excel_parser.py
Normal file
@@ -0,0 +1,189 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook, load_workbook
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
# copied from `/openpyxl/cell/cell.py`
|
||||
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
|
||||
|
||||
|
||||
class RAGFlowExcelParser:
    """Parse spreadsheet (xlsx/xls/csv) content into text lines, HTML tables,
    or Markdown, normalizing everything to an openpyxl Workbook first."""

    @staticmethod
    def _load_excel_to_workbook(file_like_object):
        """Return an openpyxl Workbook for *file_like_object* (bytes, BytesIO,
        or a path string), transparently converting CSV input.

        Raises:
            Exception: when neither CSV nor any Excel engine can parse it.
        """
        if isinstance(file_like_object, bytes):
            file_like_object = BytesIO(file_like_object)

        # Read first 4 bytes to determine file type
        file_like_object.seek(0)
        file_head = file_like_object.read(4)
        file_like_object.seek(0)

        # "PK\x03\x04" is the ZIP magic (xlsx); "\xd0\xcf\x11\xe0" is the OLE2
        # magic (legacy xls). Anything else is assumed to be CSV.
        if not (file_head.startswith(b"PK\x03\x04") or file_head.startswith(b"\xd0\xcf\x11\xe0")):
            logging.info("Not an Excel file, converting CSV to Excel Workbook")

            try:
                file_like_object.seek(0)
                df = pd.read_csv(file_like_object)
                return RAGFlowExcelParser._dataframe_to_workbook(df)

            except Exception as e_csv:
                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")

        try:
            return load_workbook(file_like_object, data_only=True)
        except Exception as e:
            # Fall back to pandas engines when openpyxl rejects the file.
            logging.info(f"openpyxl load error: {e}, try pandas instead")
            try:
                file_like_object.seek(0)
                try:
                    df = pd.read_excel(file_like_object)
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
                except Exception as ex:
                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
                    file_like_object.seek(0)
                    df = pd.read_excel(file_like_object, engine="calamine")
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_pandas:
                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")

    @staticmethod
    def _clean_dataframe(df: pd.DataFrame):
        """Replace control characters openpyxl refuses to store with spaces."""
        def clean_string(s):
            if isinstance(s, str):
                return ILLEGAL_CHARACTERS_RE.sub(" ", s)
            return s

        return df.apply(lambda col: col.map(clean_string))

    @staticmethod
    def _dataframe_to_workbook(df):
        """Materialize a DataFrame as a single-sheet Workbook named "Data",
        with the column labels written as row 1."""
        df = RAGFlowExcelParser._clean_dataframe(df)
        wb = Workbook()
        ws = wb.active
        ws.title = "Data"

        for col_num, column_name in enumerate(df.columns, 1):
            ws.cell(row=1, column=col_num, value=column_name)

        for row_num, row in enumerate(df.values, 2):
            for col_num, value in enumerate(row, 1):
                ws.cell(row=row_num, column=col_num, value=value)

        return wb

    def html(self, fnm, chunk_rows=256):
        """Render each sheet as HTML <table> chunks of at most *chunk_rows*
        data rows, repeating the header row in every chunk."""
        from html import escape

        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
        tb_chunks = []

        def _fmt(v):
            # Empty string for missing cells; stripped str() otherwise.
            if v is None:
                return ""
            return str(v).strip()

        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue

            # First row becomes the <th> header, reused for every chunk.
            tb_rows_0 = "<tr>"
            for t in list(rows[0]):
                tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
            tb_rows_0 += "</tr>"

            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
                tb = ""
                tb += f"<table><caption>{sheetname}</caption>"
                tb += tb_rows_0
                for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
                    tb += "<tr>"
                    for i, c in enumerate(r):
                        if c.value is None:
                            tb += "<td></td>"
                        else:
                            tb += f"<td>{escape(_fmt(c.value))}</td>"
                    tb += "</tr>"
                tb += "</table>\n"
                tb_chunks.append(tb)

        return tb_chunks

    def markdown(self, fnm):
        """Render the (first sheet of the) spreadsheet as a Markdown table."""
        import pandas as pd

        # NOTE(review): when fnm is a str path, .seek(0) below raises
        # AttributeError before pandas runs — confirm callers pass bytes here.
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        try:
            file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
        except Exception as e:
            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
            file_like_object.seek(0)
            df = pd.read_csv(file_like_object)
        df = df.replace(r"^\s*$", "", regex=True)
        return df.to_markdown(index=False)

    def __call__(self, fnm):
        """Flatten every sheet into "header:value; ..." text lines, appending
        the sheet name when it is not a generic "Sheet*" label."""
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)

        res = []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue
            ti = list(rows[0])  # header cells
            for r in list(rows[1:]):
                fields = []
                for i, c in enumerate(r):
                    if not c.value:
                        continue
                    t = str(ti[i].value) if i < len(ti) else ""
                    t += (":" if t else "") + str(c.value)
                    fields.append(t)
                line = "; ".join(fields)
                if sheetname.lower().find("sheet") < 0:
                    line += " ——" + sheetname
                res.append(line)
        return res

    @staticmethod
    def row_number(fnm, binary):
        """Count data rows in *binary* based on the extension of *fnm*.

        NOTE(review): extensions other than xls*/csv/txt fall through and
        return None implicitly — confirm callers handle that.
        """
        if fnm.split(".")[-1].lower().find("xls") >= 0:
            wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
            total = 0
            for sheetname in wb.sheetnames:
                ws = wb[sheetname]
                total += len(list(ws.rows))
            return total

        if fnm.split(".")[-1].lower() in ["csv", "txt"]:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
            return len(txt.split("\n"))
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc CLI: parse the spreadsheet given as the first argument.
    psr = RAGFlowExcelParser()
    psr(sys.argv[1])
||||
105
deepdoc/parser/figure_parser.py
Normal file
105
deepdoc/parser/figure_parser.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from api.utils.api_utils import timeout
|
||||
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
|
||||
from rag.prompts.generator import vision_llm_figure_describe_prompt
|
||||
|
||||
|
||||
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
    """Wrap position-less figure data into the positioned form.

    Each input item is indexed as (description, image); items whose second
    element is not a PIL image are dropped. The output items have the shape
    ``((image, [description]), [(0, 0, 0, 0, 0)])`` — i.e. a dummy
    all-zero position entry is attached to every figure.
    """
    wrapped = []
    for figure_data in figures_data_without_positions:
        if not isinstance(figure_data[1], Image.Image):
            continue
        image_with_desc = (figure_data[1], [figure_data[0]])
        wrapped.append((image_with_desc, [(0, 0, 0, 0, 0)]))
    return wrapped
||||
|
||||
|
||||
# Module-wide pool shared by all VisionFigureParser instances, capping
# concurrent vision-model calls at 10 across the whole process.
shared_executor = ThreadPoolExecutor(max_workers=10)
|
||||
|
||||
|
||||
class VisionFigureParser:
    """Describe extracted figures with a vision LLM.

    Accepts figure data with or without position tuples; calling the instance
    asks the vision model for a description of each figure concurrently and
    returns the re-assembled ``((figure, [descriptions]), position?)`` list.
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        # Invariants: one description list per figure, and positions are
        # either absent entirely or present for every figure.
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    def _extract_figures_info(self, figures_data):
        """Split *figures_data* items into figures / descriptions / positions.

        Two accepted item shapes:
          * ``((figure, [desc]), [(pn, x0, x1, top, bottom)])`` — with position
          * ``(figure, [desc])`` — without position
        """
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            # position
            if len(item) == 2 and isinstance(item[0], tuple) and len(item[0]) == 2 and isinstance(item[1], list) and isinstance(item[1][0], tuple) and len(item[1][0]) == 5:
                img_desc = item[0]
                assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        """Zip figures, descriptions and (optional) positions back together."""
        self.assembled = []
        self.has_positions = len(self.positions) != 0
        for i in range(len(self.figures)):
            figure = self.figures[i]
            desc = self.descriptions[i]
            pos = self.positions[i] if self.has_positions else None

            figure_desc = (figure, desc)

            # Positioned items are 2-tuples; unpositioned ones 1-tuples.
            if pos is not None:
                self.assembled.append((figure_desc, pos))
            else:
                self.assembled.append((figure_desc,))

        return self.assembled

    def __call__(self, **kwargs):
        """Run the vision model over every figure and return the assembled list.

        Keyword Args:
            callback: optional ``callback(prog, msg)`` progress hook.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)

        # Each task is bounded by the project `timeout` decorator so one stuck
        # model call cannot hang the whole batch.
        @timeout(30, 3)
        def process(figure_idx, figure_binary):
            description_text = picture_vision_llm_chunk(
                binary=figure_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )
            return figure_idx, description_text

        futures = []
        for idx, img_binary in enumerate(self.figures or []):
            futures.append(shared_executor.submit(process, idx, img_binary))

        for future in as_completed(futures):
            figure_num, txt = future.result()
            if txt:
                # Prepend the generated description to the joined prior ones.
                # NOTE(review): this concatenates txt directly with the join —
                # possibly missing a "\n" separator between txt and the first
                # prior description; confirm intended.
                self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])

        self._assemble()

        return self.assembled
||||
214
deepdoc/parser/html_parser.py
Normal file
214
deepdoc/parser/html_parser.py
Normal file
@@ -0,0 +1,214 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from rag.nlp import find_codec, rag_tokenizer
|
||||
import uuid
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
import html
|
||||
|
||||
def get_encoding(file):
    """Detect and return the character encoding of *file* via chardet."""
    with open(file, 'rb') as fp:
        detection = chardet.detect(fp.read())
    return detection['encoding']
||||
|
||||
# HTML tags that delimit a logical text block when flattening a document.
BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption"
]
# Markdown heading prefix for each HTML heading level.
# Fix: "h4" previously mapped to five '#' characters (same as "h5"),
# producing the wrong heading level; it must be four.
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}
||||
|
||||
|
||||
class RAGFlowHtmlParser:
    """Parse HTML into token-bounded text chunks plus standalone table chunks.

    Fix: in ``merge_block_text`` the non-block, non-table branch had a
    mis-parenthesized ternary — ``(" " if current_content else "" + content)``
    — which silently dropped the content whenever ``current_content`` was
    non-empty (only a space was appended). It now matches the correct form
    used in the block-id branch: ``(" " if current_content else "") + content``.
    """

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse *binary* (preferred) or the file at *fnm* and return chunks."""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                txt = f.read()
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num):
        """Strip noise from the HTML, flatten it, and chunk by token budget.

        Returns text chunks first, then one chunk per extracted table.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be string!")

        temp_sections = []
        soup = BeautifulSoup(txt, "html5lib")
        # delete <style> and <script> tags
        for style_tag in soup.find_all(["style", "script"]):
            style_tag.decompose()
        # delete <script> tag in <div>
        for div_tag in soup.find_all("div"):
            for script_tag in div_tag.find_all("script"):
                script_tag.decompose()
        # delete inline style
        for tag in soup.find_all(True):
            if 'style' in tag.attrs:
                del tag.attrs['style']
        # delete HTML comment
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
        block_txt_list, table_list = cls.merge_block_text(temp_sections)
        sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
        for table in table_list:
            sections.append(table.get("content", ""))
        return sections

    @classmethod
    def split_table(cls, html_table, chunk_token_num=512):
        """Split an HTML table into several <table> strings, each holding at
        most ~chunk_token_num tokens' worth of rows."""
        soup = BeautifulSoup(html_table, "html.parser")
        rows = soup.find_all("tr")
        tables = []
        current_table = []
        current_count = 0
        table_str_list = []
        for row in rows:
            tks_str = rag_tokenizer.tokenize(str(row))
            token_count = len(tks_str.split(" ")) if tks_str else 0
            # Start a new group when this row would exceed the budget.
            if current_count + token_count > chunk_token_num:
                tables.append(current_table)
                current_table = []
                current_count = 0
            current_table.append(row)
            current_count += token_count
        if current_table:
            tables.append(current_table)

        # Re-wrap each group of rows in its own <table> element.
        for table_rows in tables:
            new_table = soup.new_tag("table")
            for row in table_rows:
                new_table.append(row)
            table_str_list.append(str(new_table))

        return table_str_list

    @classmethod
    def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
        """Depth-first walk producing {"content", "tag_name", "metadata"} dicts.

        Text nodes that themselves contain HTML are re-parsed; tables are
        emitted whole with their own table_id; descendants of a block-level
        tag share one block_id so they can be merged later.
        """
        if isinstance(element, NavigableString):
            content = element.strip()

            def is_valid_html(content):
                # A string that bs4 can find a tag in is treated as embedded HTML.
                try:
                    soup = BeautifulSoup(content, "html.parser")
                    return bool(soup.find())
                except Exception:
                    return False

            return_info = []
            if content:
                if is_valid_html(content):
                    soup = BeautifulSoup(content, "html.parser")
                    child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id)
                    parser_result.extend(child_info)
                else:
                    info = {"content": element.strip(), "tag_name": "inner_text", "metadata": {"block_id": block_id}}
                    if parent_name:
                        info["tag_name"] = parent_name
                    return_info.append(info)
            return return_info
        elif isinstance(element, Tag):

            if str.lower(element.name) == "table":
                table_info_list = []
                table_id = str(uuid.uuid1())
                table_list = [html.unescape(str(element))]
                for t in table_list:
                    table_info_list.append({"content": t, "tag_name": "table",
                                            "metadata": {"table_id": table_id, "index": table_list.index(t)}})
                return table_info_list
            else:
                block_id = None
                # Each block-level tag opens a fresh block for its descendants.
                if str.lower(element.name) in BLOCK_TAGS:
                    block_id = str(uuid.uuid1())
                for child in element.children:
                    child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name,
                                                           block_id)
                    parser_result.extend(child_info)
                return []

    @classmethod
    def merge_block_text(cls, parser_result):
        """Merge walker output into per-block strings; tables are split off.

        Returns (block_texts, table_items).
        """
        block_content = []
        current_content = ""
        table_info_list = []
        lask_block_id = None
        for item in parser_result:
            content = item.get("content")
            tag_name = item.get("tag_name")
            title_flag = tag_name in TITLE_TAGS
            block_id = item.get("metadata", {}).get("block_id")
            if block_id:
                # Headings get their Markdown prefix.
                if title_flag:
                    content = f"{TITLE_TAGS[tag_name]} {content}"
                if lask_block_id != block_id:
                    # Crossing into a new block: flush the accumulated one.
                    if lask_block_id is not None:
                        block_content.append(current_content)
                    current_content = content
                    lask_block_id = block_id
                else:
                    current_content += (" " if current_content else "") + content
            else:
                if tag_name == "table":
                    table_info_list.append(item)
                else:
                    # FIX: ternary was mis-parenthesized as
                    # (" " if current_content else "" + content), dropping the
                    # content whenever current_content was non-empty.
                    current_content += (" " if current_content else "") + content
        if current_content:
            block_content.append(current_content)
        return block_content, table_info_list

    @classmethod
    def chunk_block(cls, block_txt_list, chunk_token_num=512):
        """Pack block texts into chunks of at most chunk_token_num tokens,
        splitting any single oversized block on token boundaries."""
        chunks = []
        current_block = ""
        current_token_count = 0

        for block in block_txt_list:
            tks_str = rag_tokenizer.tokenize(block)
            block_token_count = len(tks_str.split(" ")) if tks_str else 0
            if block_token_count > chunk_token_num:
                # Oversized block: flush what we have, then hard-split it.
                if current_block:
                    chunks.append(current_block)
                start = 0
                tokens = tks_str.split(" ")
                while start < len(tokens):
                    end = start + chunk_token_num
                    split_tokens = tokens[start:end]
                    chunks.append(" ".join(split_tokens))
                    start = end
                current_block = ""
                current_token_count = 0
            else:
                if current_token_count + block_token_count <= chunk_token_num:
                    current_block += ("\n" if current_block else "") + block
                    current_token_count += block_token_count
                else:
                    chunks.append(current_block)
                    current_block = block
                    current_token_count = block_token_count

        if current_block:
            chunks.append(current_block)

        return chunks
||||
|
||||
179
deepdoc/parser/json_parser.py
Normal file
179
deepdoc/parser/json_parser.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# The following documents are mainly referenced, and only adaptation modifications have been made
|
||||
# from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
class RAGFlowJsonParser:
    """Split JSON or JSONL payloads into size-bounded JSON-string chunks.

    Adapted from langchain_text_splitters' RecursiveJsonSplitter; sizes are
    measured in characters of the serialized (non-ASCII-preserving) JSON.
    """

    def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
        super().__init__()
        # The working budget is doubled relative to the requested size.
        self.max_chunk_size = max_chunk_size * 2
        self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)

    def __call__(self, binary):
        """Decode *binary*, auto-detect JSON vs JSONL, and return chunk strings."""
        encoding = find_codec(binary)
        txt = binary.decode(encoding, errors="ignore")

        if self.is_jsonl_format(txt):
            sections = self._parse_jsonl(txt)
        else:
            sections = self._parse_json(txt)
        return sections

    @staticmethod
    def _json_size(data: dict) -> int:
        """Calculate the size of the serialized JSON object."""
        return len(json.dumps(data, ensure_ascii=False))

    @staticmethod
    def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
        """Set a value in a nested dictionary based on the given path."""
        for key in path[:-1]:
            d = d.setdefault(key, {})
        d[path[-1]] = value

    def _list_to_dict_preprocessing(self, data: Any) -> Any:
        """Recursively replace lists with index-keyed dicts so the splitter
        only has to handle dict nesting."""
        if isinstance(data, dict):
            # Process each key-value pair in the dictionary
            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
        elif isinstance(data, list):
            # Convert the list to a dictionary with index-based keys
            return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
        else:
            # Base case: the item is neither a dict nor a list, so return it unchanged
            return data

    def _json_split(
        self,
        data,
        current_path: list[str] | None,
        chunks: list[dict] | None,
    ) -> list[dict]:
        """
        Split json into maximum size dictionaries while preserving structure.

        Recursively packs key/value pairs into the last chunk until the
        serialized size would exceed max_chunk_size, then opens a new chunk
        (only once the current one has reached min_chunk_size).
        """
        current_path = current_path or []
        chunks = chunks or [{}]
        if isinstance(data, dict):
            for key, value in data.items():
                new_path = current_path + [key]
                chunk_size = self._json_size(chunks[-1])
                size = self._json_size({key: value})
                remaining = self.max_chunk_size - chunk_size

                if size < remaining:
                    # Add item to current chunk
                    self._set_nested_dict(chunks[-1], new_path, value)
                else:
                    if chunk_size >= self.min_chunk_size:
                        # Chunk is big enough, start a new chunk
                        chunks.append({})

                    # Iterate
                    self._json_split(value, new_path, chunks)
        else:
            # handle single item
            self._set_nested_dict(chunks[-1], current_path, data)
        return chunks

    def split_json(
        self,
        json_data,
        convert_lists: bool = False,
    ) -> list[dict]:
        """Splits JSON into a list of JSON chunks"""

        if convert_lists:
            preprocessed_data = self._list_to_dict_preprocessing(json_data)
            chunks = self._json_split(preprocessed_data, None, None)
        else:
            chunks = self._json_split(json_data, None, None)

        # Remove the last chunk if it's empty
        if not chunks[-1]:
            chunks.pop()
        return chunks

    def split_text(
        self,
        json_data: dict[str, Any],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
    ) -> list[str]:
        """Splits JSON into a list of JSON formatted strings"""

        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

        # Convert to string
        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]

    def _parse_json(self, content: str) -> list[str]:
        """Parse one JSON document; invalid JSON yields an empty list."""
        sections = []
        try:
            json_data = json.loads(content)
            chunks = self.split_json(json_data, True)
            sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
        except json.JSONDecodeError:
            pass
        return sections

    def _parse_jsonl(self, content: str) -> list[str]:
        """Parse JSON-Lines content line by line, skipping invalid lines."""
        lines = content.strip().splitlines()
        all_chunks = []
        for line in lines:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                chunks = self.split_json(data, convert_lists=True)
                all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
            except json.JSONDecodeError:
                continue
        return all_chunks

    def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
        """Heuristically decide whether *txt* is JSONL rather than one JSON doc.

        A text that parses whole is JSON; otherwise it is JSONL when at least
        *threshold* of the first *sample_limit* non-empty lines parse alone.
        """
        lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
        if not lines:
            return False

        try:
            json.loads(txt)
            return False
        except json.JSONDecodeError:
            pass

        sample_limit = min(len(lines), sample_limit)
        sample_lines = lines[:sample_limit]
        valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))

        if not valid_lines:
            return False

        return (valid_lines / len(sample_lines)) >= threshold

    def _is_valid_json(self, line: str) -> bool:
        """Return True when *line* parses as a standalone JSON value."""
        try:
            json.loads(line)
            return True
        except json.JSONDecodeError:
            return False
||||
273
deepdoc/parser/markdown_parser.py
Normal file
273
deepdoc/parser/markdown_parser.py
Normal file
@@ -0,0 +1,273 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
import mistune
|
||||
from markdown import markdown
|
||||
|
||||
|
||||
class RAGFlowMarkdownParser:
    """Markdown pre-processor that pulls tables out of a document.

    ``extract_tables_and_remainder`` returns the text with tables either
    removed (``separate_tables=True``) or inlined, plus the list of raw
    table snippets that were found.
    """

    def __init__(self, chunk_token_num=128):
        # Target chunk size in tokens; kept as int so callers may pass strings.
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Find markdown and HTML tables in *markdown_text*.

        Args:
            markdown_text: the raw markdown source.
            separate_tables: when True, tables are removed from the returned
                text; when False, markdown tables are replaced by their HTML
                rendering and HTML tables are kept verbatim.

        Returns:
            ``(remainder_text, tables)`` where ``tables`` is a list of the raw
            matched table snippets in the order found.
        """
        tables = []
        working_text = markdown_text

        def replace_tables_with_rendered_html(pattern, table_list, render=True):
            # Scan the current `working_text` for `pattern`; collect each raw
            # match into `table_list` and either drop it or re-insert it
            # (rendered to HTML when `render` is True) in the rebuilt text.
            new_text = ""
            last_end = 0
            for match in pattern.finditer(working_text):
                raw_table = match.group()
                table_list.append(raw_table)
                if separate_tables:
                    # Skip this match (i.e., remove it)
                    new_text += working_text[last_end : match.start()] + "\n\n"
                else:
                    # Replace with rendered HTML (or the raw snippet when render=False)
                    html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
                    new_text += working_text[last_end : match.start()] + html_table + "\n\n"
                last_end = match.end()
            new_text += working_text[last_end:]
            return new_text

        if "|" in markdown_text:  # cheap pre-check before running the table regexes
            # Standard Markdown table (bordered: every row starts with '|')
            border_table_pattern = re.compile(
                r"""
                (?:\n|^)
                (?:\|.*?\|.*?\|.*?\n)
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
            """,
                re.VERBOSE,
            )
            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)

            # Borderless Markdown table (rows separated by '|' but no leading pipe)
            no_border_table_pattern = re.compile(
                r"""
                (?:\n|^)
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                """,
                re.VERBOSE,
            )
            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)

        if "<table>" in working_text.lower():  # cheap pre-check before the HTML regex
            # HTML table extraction - handle possible html/body wrapper tags
            html_table_pattern = re.compile(
                r"""
                (?:\n|^)
                \s*
                (?:
                    # case1: <html><body><table>...</table></body></html>
                    (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
                    |
                    # case2: <body><table>...</table></body>
                    (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
                    |
                    # case3: only<table>...</table>
                    (?:<table[^>]*>.*?</table>)
                )
                \s*
                (?=\n|$)
                """,
                re.VERBOSE | re.DOTALL | re.IGNORECASE,
            )
            # HTML tables are never re-rendered: render=False keeps the raw
            # snippet when inlining, which matches the previous dedicated
            # (and otherwise line-for-line identical) replacement loop.
            working_text = replace_tables_with_rendered_html(html_table_pattern, tables, render=False)

        return working_text, tables
|
||||
|
||||
|
||||
class MarkdownElementExtractor:
    """Split markdown text into a flat list of top-level element strings.

    Extraction is purely line-based: ``extract_elements`` walks ``self.lines``
    once and delegates to per-element helpers, each returning the element's
    text plus the inclusive line range it consumed.
    """

    def __init__(self, markdown_content):
        # Raw text and its line view used by every extractor below.
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")
        # A mistune AST is built eagerly; NOTE(review): none of the visible
        # extraction methods consult these nodes — confirm before removing.
        self.ast_parser = mistune.create_markdown(renderer="ast")
        self.ast_nodes = self.ast_parser(markdown_content)

    def extract_elements(self):
        """Extract individual elements (headers, code blocks, lists, etc.)"""
        sections = []

        i = 0
        while i < len(self.lines):
            line = self.lines[i]

            if re.match(r"^#{1,6}\s+.*$", line):
                # header
                element = self._extract_header(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif line.strip().startswith("```"):
                # code block
                element = self._extract_code_block(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
                # list block (bulleted or numbered)
                element = self._extract_list_block(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif line.strip().startswith(">"):
                # blockquote
                element = self._extract_blockquote(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            elif line.strip():
                # text block (paragraphs and inline elements until next block element)
                element = self._extract_text_block(i)
                sections.append(element["content"])
                i = element["end_line"] + 1
            else:
                # blank line between elements — nothing to collect
                i += 1

        # Drop sections that ended up empty or whitespace-only.
        sections = [section for section in sections if section.strip()]
        return sections

    def _extract_header(self, start_pos):
        """A header is always exactly one line."""
        return {
            "type": "header",
            "content": self.lines[start_pos],
            "start_line": start_pos,
            "end_line": start_pos,
        }

    def _extract_code_block(self, start_pos):
        """Consume from the opening ``` fence through the closing fence.

        If no closing fence exists, everything to the end of input is consumed.
        """
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]

        # Find the end of the code block
        for i in range(start_pos + 1, len(self.lines)):
            content_lines.append(self.lines[i])
            end_pos = i
            if self.lines[i].strip().startswith("```"):
                break

        return {
            "type": "code_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_list_block(self, start_pos):
        """Consume consecutive list items plus their continuations.

        Blank lines, indented sub-items, and indented wrapped text after the
        first item are all treated as part of the same list block.
        """
        end_pos = start_pos
        content_lines = []

        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            # check if this line is a list item or continuation of a list
            if (
                re.match(r"^\s*[-*+]\s+.*$", line)
                or re.match(r"^\s*\d+\.\s+.*$", line)
                or (i > start_pos and not line.strip())
                or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
                or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
                or (i > start_pos and re.match(r"^\s+\w+.*$", line))
            ):
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break

        return {
            "type": "list_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_blockquote(self, start_pos):
        """Consume consecutive '>' lines; interior blank lines are kept."""
        end_pos = start_pos
        content_lines = []

        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            if line.strip().startswith(">") or (i > start_pos and not line.strip()):
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break

        return {
            "type": "blockquote",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element"""
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]

        i = start_pos + 1
        while i < len(self.lines):
            line = self.lines[i]
            # stop if we encounter a block element
            if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
                break
            elif not line.strip():
                # Blank line: only ends the text block when the next line
                # starts a block element; otherwise it is kept as part of
                # the paragraph run.
                if i + 1 < len(self.lines) and (
                    re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
                    or self.lines[i + 1].strip().startswith("```")
                    or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
                    or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
                    or self.lines[i + 1].strip().startswith(">")
                ):
                    break
                else:
                    content_lines.append(line)
                    end_pos = i
                    i += 1
            else:
                content_lines.append(line)
                end_pos = i
                i += 1

        return {
            "type": "text_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }
|
||||
1287
deepdoc/parser/pdf_parser.py
Normal file
1287
deepdoc/parser/pdf_parser.py
Normal file
File diff suppressed because it is too large
Load Diff
99
deepdoc/parser/ppt_parser.py
Normal file
99
deepdoc/parser/ppt_parser.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class RAGFlowPptParser:
    """Extract plain text from PowerPoint files via python-pptx.

    Calling an instance returns one string per slide in the requested page
    range, with shapes visited roughly top-to-bottom, left-to-right.
    """

    def __init__(self):
        super().__init__()

    def __get_bulleted_text(self, paragraph):
        # A paragraph is considered bulleted when its XML carries any of the
        # three bullet properties (character, auto-number, or picture bullet).
        is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
        if is_bulleted:
            # Indent by nesting level and prefix a dot as the bullet marker.
            return f"{' '* paragraph.level}.{paragraph.text}"
        else:
            return paragraph.text

    def __extract(self, shape):
        """Best-effort text extraction from a single shape.

        Returns "" for shapes with no extractable text; any unexpected error
        is logged and swallowed so one bad shape cannot abort the slide.
        """
        try:
            # First try to get text content
            if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
                text_frame = shape.text_frame
                texts = []
                for paragraph in text_frame.paragraphs:
                    if paragraph.text.strip():
                        texts.append(self.__get_bulleted_text(paragraph))
                return "\n".join(texts)

            # Safely get shape_type
            try:
                shape_type = shape.shape_type
            except NotImplementedError:
                # If shape_type is not available, try to get text content
                if hasattr(shape, 'text'):
                    return shape.text.strip()
                return ""

            # Handle table (19 — presumably MSO_SHAPE_TYPE.TABLE; confirm
            # against the python-pptx enum before changing)
            if shape_type == 19:
                tb = shape.table
                rows = []
                # Row 0 is treated as the header: each cell becomes
                # "header: value" pairs joined per data row.
                for i in range(1, len(tb.rows)):
                    rows.append("; ".join([tb.cell(
                        0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
                return "\n".join(rows)

            # Handle group shape (6 — presumably MSO_SHAPE_TYPE.GROUP) by
            # recursing into children in visual order.
            if shape_type == 6:
                texts = []
                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
                    t = self.__extract(p)
                    if t:
                        texts.append(t)
                return "\n".join(texts)

            return ""

        except Exception as e:
            logging.error(f"Error processing shape: {str(e)}")
            return ""

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Parse *fnm* (path or raw bytes) and return a list of slide texts.

        Only slides in [from_page, to_page) are parsed; ``self.total_page``
        is set to the presentation's slide count as a side effect.
        NOTE(review): ``callback`` is accepted but never invoked here.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(
            BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            # Visit shapes top-to-bottom (bucketed by 10 EMUs) then
            # left-to-right; missing coordinates sort as 0.
            for shape in sorted(
                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
                try:
                    txt = self.__extract(shape)
                    if txt:
                        texts.append(txt)
                except Exception as e:
                    logging.exception(e)
            txts.append("\n".join(texts))

        return txts
|
||||
109
deepdoc/parser/resume/__init__.py
Normal file
109
deepdoc/parser/resume/__init__.py
Normal file
@@ -0,0 +1,109 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
def refactor(cv):
    """Normalize a parsed resume dict in place and return it.

    Drops parser bookkeeping fields, re-keys the section lists as
    ``{"0": item, ...}`` dicts, promotes the latest work/education entry
    into ``cv["basic"]``, and guarantees ``basic``/``contact`` exist.
    """
    # Drop bookkeeping left over from the parsing stage.
    for key in ("raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"):
        if cv.get(key) is not None:
            del cv[key]
    cv["is_deleted"] = 0
    cv.setdefault("basic", {})
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    # Normalize each section to a string-indexed dict, stripping the
    # per-entry "external" payload along the way.
    for section in ("education", "work", "certificate", "project", "language", "skill", "training"):
        items = cv.get(section)
        if items is None:
            continue
        if isinstance(items, dict):
            items = [v for _, v in items.items()]
        if not isinstance(items, list):
            del cv[section]
            continue
        cleaned = []
        for entry in items:
            if "external" in entry and entry["external"] is not None:
                del entry["external"]
            cleaned.append(entry)
        cv[section] = {str(idx): item for idx, item in enumerate(cleaned)}

    # Rename legacy salary fields when they carry a truthy value.
    for old, new in (
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    ):
        if cv["basic"].get(old):
            cv["basic"][new] = cv["basic"].pop(old)

    # Chronologically ordered work and education histories.
    jobs = sorted(cv.get("work", {}).values(), key=lambda j: j.get("start_time", ""))
    schooling = sorted(cv.get("education", {}).values(), key=lambda e: e.get("start_time", ""))

    if jobs:
        latest = jobs[-1]
        cv["basic"]["work_start_time"] = jobs[0].get("start_time", "")
        cv["basic"]["management_experience"] = (
            "Y" if any(j.get("management_experience", "") == "Y" for j in jobs) else "N"
        )
        cv["basic"]["annual_salary"] = latest.get("annual_salary_from", "0")

        # Promote the most recent job's descriptive fields into basic.
        for field in (
            "annual_salary_from",
            "annual_salary_to",
            "industry_name",
            "position_name",
            "responsibilities",
            "corporation_type",
            "scale",
            "corporation_name",
        ):
            cv["basic"][field] = latest.get(field, "")

    if schooling:
        for field in ("school_name", "discipline_name"):
            if field in schooling[-1]:
                cv["basic"][field] = schooling[-1][field]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    cv.setdefault("contact", {})
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
|
||||
15
deepdoc/parser/resume/entities/__init__.py
Normal file
15
deepdoc/parser/resume/entities/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
128
deepdoc/parser/resume/entities/corporations.py
Normal file
128
deepdoc/parser/resume/entities/corporations.py
Normal file
@@ -0,0 +1,128 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
import pandas as pd
|
||||
from rag.nlp import rag_tokenizer
|
||||
from . import regions
|
||||
|
||||
|
||||
# Resource tables shipped in the res/ directory next to this module.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# Corporation "baike" (encyclopedia) entry lengths, indexed by corp id (string).
GOODS = pd.read_csv(
    os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])


def _load_json(relative_name):
    # Context manager closes the file deterministically — the previous bare
    # open() calls leaked their descriptors until garbage collection.
    with open(os.path.join(current_file_path, relative_name), "r", encoding="utf-8") as f:
        return json.load(f)


CORP_TKS = _load_json("res/corp.tks.freq.json")  # corporation-name token frequencies
GOOD_CORP = _load_json("res/good_corp.json")     # list of well-known corporation names
CORP_TAG = _load_json("res/corp_tag.json")       # corporation name -> tag list
|
||||
|
||||
|
||||
def baike(cid, default_v=0):
    """Return the baike entry length for corporation *cid*.

    Falls back to *default_v* on any lookup failure (unknown id, bad key
    type, missing table) — this is a deliberate best-effort lookup.
    """
    try:
        return GOODS.loc[str(cid), "len"]
    except Exception:
        return default_v
|
||||
|
||||
|
||||
def corpNorm(nm, add_region=True):
    """Normalize a corporation name into a canonical matching key.

    Lowercases, simplifies traditional characters, strips punctuation and
    legal-entity suffixes, removes region words and high-frequency tokens,
    and (when *add_region* is True) re-appends the first region word found,
    e.g. ``"(北京)"``. Returns "" for falsy or non-string input.
    """
    global CORP_TKS
    if not nm or not isinstance(nm, str):
        return ""
    # Full-width -> half-width, traditional -> simplified, lowercase.
    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
    # Decode HTML-escaped ampersands.
    nm = re.sub(r"&amp;", "&", nm)
    # Collapse brackets, quotes and similar punctuation to spaces.
    nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
    # Drop trailing legal-entity markers (co./corp./inc./ltd. and em-dash tails).
    nm = re.sub(
        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE
    )
    # Drop Chinese company-type suffixes ("有限公司", "研发中心", ...).
    nm = re.sub(
        r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
        "",
        nm,
        count=10000,
        flags=re.IGNORECASE,
    )
    # Very short names that do not start with a region word are returned as-is.
    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
        return nm

    tks = rag_tokenizer.tokenize(nm).split()
    # Region words found in the name ("中国" counts only when not leading).
    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
        # Skip region words and tokens common across corporation names.
        if regions.isName(t) or t in CORP_TKS:
            continue
        # Keep a space between consecutive latin/numeric tokens.
        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
            nm += " "
        nm += t

    # If the name is a CJK prefix followed by a latin tail (or vice versa),
    # keep only the dominant half.
    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
    if r:
        nm = r.group(1)
    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
    if r:
        nm = r.group(1)
    return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
|
||||
|
||||
|
||||
def rmNoise(n):
    """Strip parenthesized qualifiers and separator punctuation from a name."""
    without_parens = re.sub(r"[\((][^()()]+[))]", "", n)
    return re.sub(r"[,. &()()]+", "", without_parens)
|
||||
|
||||
|
||||
# Normalize the good-corporation whitelist once at import time.
GOOD_CORP = {corpNorm(rmNoise(c), False) for c in GOOD_CORP}

# Log (debug) any tag-table key whose name normalizes away entirely,
# then re-key the tag table by normalized corporation name.
for c, v in CORP_TAG.items():
    normalized = corpNorm(rmNoise(c), False)
    if not normalized:
        logging.debug(c)
CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}
|
||||
|
||||
|
||||
def is_good(nm):
    """Return True when *nm* normalizes to a known good corporation."""
    # Dispatched/outsourced ("外派") positions never count.
    if "外派" in nm:
        return False
    normalized = corpNorm(rmNoise(nm), False)
    for candidate in GOOD_CORP:
        if re.match(r"[0-9a-zA-Z]+$", candidate):
            # Purely alphanumeric names must match exactly.
            if candidate == normalized:
                return True
        elif candidate in normalized:
            return True
    return False
|
||||
|
||||
|
||||
def corp_tag(nm):
    """Return the tag list for corporation *nm* ([] when nothing matches)."""
    normalized = corpNorm(rmNoise(nm), False)
    for known in CORP_TAG.keys():
        if re.match(r"[0-9a-zA-Z., ]+$", known):
            # Latin-only keys must match the whole normalized name.
            if known == normalized:
                return CORP_TAG[known]
        elif known in normalized:
            # Substring hit; skip keys so short the match is likely spurious.
            if len(known) < 3 and len(normalized) / len(known) >= 2:
                continue
            return CORP_TAG[known]
    return []
||||
44
deepdoc/parser/resume/entities/degrees.py
Normal file
44
deepdoc/parser/resume/entities/degrees.py
Normal file
@@ -0,0 +1,44 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Degree-level lookup table: internal id (string) -> Chinese display name.
# Ids are non-contiguous; presumably they mirror an upstream HR system's
# codes — TODO confirm against the data source.
TBL = {
    "94": "EMBA",
    "6": "MBA",
    "95": "MPA",
    "92": "专升本",
    "4": "专科",
    "90": "中专",
    "91": "中技",
    "86": "初中",
    "3": "博士",
    "10": "博士后",
    "1": "本科",
    "2": "硕士",
    "87": "职高",
    "89": "高中",
}

# Reverse map for get_id: display name -> id (names are unique above).
TBL_ = {v: k for k, v in TBL.items()}
|
||||
|
||||
|
||||
def get_name(id):
    """Return the display name for degree id *id* ("" when unknown)."""
    key = str(id)
    return TBL.get(key, "")
|
||||
|
||||
|
||||
def get_id(nm):
    """Reverse lookup: degree display name -> id ("" for falsy/unknown input)."""
    return TBL_.get(nm.upper().strip(), "") if nm else ""
|
||||
712
deepdoc/parser/resume/entities/industries.py
Normal file
712
deepdoc/parser/resume/entities/industries.py
Normal file
@@ -0,0 +1,712 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
TBL = {
|
||||
"1": {"name": "IT/通信/电子", "parent": "0"},
|
||||
"2": {"name": "互联网", "parent": "0"},
|
||||
"3": {"name": "电子商务", "parent": "2"},
|
||||
"4": {"name": "互联网金融", "parent": "2"},
|
||||
"5": {"name": "网络游戏", "parent": "2"},
|
||||
"6": {"name": "社交网络平台", "parent": "2"},
|
||||
"7": {"name": "视频音乐", "parent": "2"},
|
||||
"9": {"name": "安全", "parent": "2"},
|
||||
"10": {"name": "云计算", "parent": "2"},
|
||||
"12": {"name": "工具类客户端应用", "parent": "2"},
|
||||
"13": {"name": "互联网广告", "parent": "2"},
|
||||
"14": {"name": "企业互联网服务", "parent": "2"},
|
||||
"16": {"name": "在线教育", "parent": "2"},
|
||||
"17": {"name": "在线医疗", "parent": "2"},
|
||||
"19": {"name": "B2B", "parent": "3"},
|
||||
"20": {"name": "B2C", "parent": "3"},
|
||||
"21": {"name": "C2C", "parent": "3"},
|
||||
"22": {"name": "生活信息本地化", "parent": "3"},
|
||||
"23": {"name": "在线旅游", "parent": "2"},
|
||||
"24": {"name": "第三方支付", "parent": "4"},
|
||||
"26": {"name": "客户端游戏", "parent": "5"},
|
||||
"27": {"name": "网页游戏", "parent": "5"},
|
||||
"28": {"name": "手机游戏", "parent": "5"},
|
||||
"29": {"name": "微博", "parent": "6"},
|
||||
"30": {"name": "社交网站", "parent": "6"},
|
||||
"31": {"name": "在线视频", "parent": "7"},
|
||||
"32": {"name": "在线音乐", "parent": "7"},
|
||||
"35": {"name": "企业安全", "parent": "9"},
|
||||
"36": {"name": "个人安全", "parent": "9"},
|
||||
"37": {"name": "企业级云服务", "parent": "10"},
|
||||
"38": {"name": "个人级云服务", "parent": "10"},
|
||||
"43": {"name": "输入法", "parent": "12"},
|
||||
"44": {"name": "浏览器", "parent": "12"},
|
||||
"45": {"name": "词典", "parent": "12"},
|
||||
"46": {"name": "播放器", "parent": "12"},
|
||||
"47": {"name": "下载器", "parent": "12"},
|
||||
"48": {"name": "IM", "parent": "12"},
|
||||
"49": {"name": "广告服务", "parent": "13"},
|
||||
"50": {"name": "第三方广告网络平台", "parent": "13"},
|
||||
"51": {"name": "媒体代理", "parent": "13"},
|
||||
"52": {"name": "创意代理", "parent": "13"},
|
||||
"53": {"name": "IT-综合", "parent": "1"},
|
||||
"71": {"name": "团购", "parent": "3"},
|
||||
"72": {"name": "地图", "parent": "2"},
|
||||
"73": {"name": "数据存储", "parent": "2"},
|
||||
"414": {"name": "计算机软件", "parent": "1"},
|
||||
"415": {"name": "计算机硬件", "parent": "1"},
|
||||
"416": {"name": "计算机服务(系统、数据服务、维修)", "parent": "1"},
|
||||
"417": {"name": "通信/电信/网络设备", "parent": "1"},
|
||||
"418": {"name": "通信/电信运营、增值服务", "parent": "1"},
|
||||
"419": {"name": "电子技术/半导体/集成电路", "parent": "1"},
|
||||
"472": {"name": "P2P网贷", "parent": "4"},
|
||||
"473": {"name": "互联网理财", "parent": "4"},
|
||||
"474": {"name": "婚恋", "parent": "6"},
|
||||
"476": {"name": "虚拟化", "parent": "10"},
|
||||
"477": {"name": "邮箱", "parent": "12"},
|
||||
"478": {"name": "商业智能", "parent": "14"},
|
||||
"479": {"name": "企业建站", "parent": "14"},
|
||||
"480": {"name": "安防", "parent": "14"},
|
||||
"481": {"name": "网络营销", "parent": "2"},
|
||||
"487": {"name": "智能终端", "parent": "2"},
|
||||
"488": {"name": "移动互联网", "parent": "2"},
|
||||
"489": {"name": "数字城市", "parent": "2"},
|
||||
"490": {"name": "大数据", "parent": "2"},
|
||||
"491": {"name": "互联网人力资源", "parent": "2"},
|
||||
"492": {"name": "舆情监控", "parent": "2"},
|
||||
"493": {"name": "移动营销", "parent": "481"},
|
||||
"494": {"name": "微博营销", "parent": "481"},
|
||||
"495": {"name": "精准营销", "parent": "481"},
|
||||
"496": {"name": "海外营销", "parent": "481"},
|
||||
"497": {"name": "微信营销", "parent": "481"},
|
||||
"498": {"name": "智能手机", "parent": "487"},
|
||||
"499": {"name": "可穿戴设备", "parent": "487"},
|
||||
"500": {"name": "智能电视", "parent": "487"},
|
||||
"501": {"name": "WAP", "parent": "488"},
|
||||
"502": {"name": "物联网", "parent": "489"},
|
||||
"503": {"name": "O2O", "parent": "489"},
|
||||
"504": {"name": "数字出版", "parent": "489"},
|
||||
"505": {"name": "搜索", "parent": "2"},
|
||||
"506": {"name": "垂直搜索", "parent": "505"},
|
||||
"507": {"name": "无线搜索", "parent": "505"},
|
||||
"508": {"name": "网页搜索", "parent": "505"},
|
||||
"509": {"name": "网址导航", "parent": "2"},
|
||||
"510": {"name": "门户", "parent": "2"},
|
||||
"511": {"name": "网络文学", "parent": "2"},
|
||||
"512": {"name": "自媒体", "parent": "2"},
|
||||
"513": {"name": "金融", "parent": "0"},
|
||||
"514": {"name": "建筑与房地产", "parent": "0"},
|
||||
"515": {"name": "专业服务", "parent": "0"},
|
||||
"516": {"name": "教育培训", "parent": "0"},
|
||||
"517": {"name": "文化传媒", "parent": "0"},
|
||||
"518": {"name": "消费品", "parent": "0"},
|
||||
"519": {"name": "工业", "parent": "0"},
|
||||
"520": {"name": "交通物流", "parent": "0"},
|
||||
"521": {"name": "贸易", "parent": "0"},
|
||||
"522": {"name": "医药", "parent": "0"},
|
||||
"523": {"name": "医疗器械", "parent": "522"},
|
||||
"524": {"name": "保健品", "parent": "518"},
|
||||
"525": {"name": "服务业", "parent": "0"},
|
||||
"526": {"name": "能源/矿产/环保", "parent": "0"},
|
||||
"527": {"name": "化工", "parent": "0"},
|
||||
"528": {"name": "政府", "parent": "0"},
|
||||
"529": {"name": "公共事业", "parent": "0"},
|
||||
"530": {"name": "非盈利机构", "parent": "0"},
|
||||
"531": {"name": "农业", "parent": "1131"},
|
||||
"532": {"name": "林业", "parent": "1131"},
|
||||
"533": {"name": "畜牧业", "parent": "1131"},
|
||||
"534": {"name": "渔业", "parent": "1131"},
|
||||
"535": {"name": "学术科研", "parent": "0"},
|
||||
"536": {"name": "零售", "parent": "0"},
|
||||
"537": {"name": "银行", "parent": "513"},
|
||||
"538": {"name": "保险", "parent": "513"},
|
||||
"539": {"name": "证券", "parent": "513"},
|
||||
"540": {"name": "基金", "parent": "513"},
|
||||
"541": {"name": "信托", "parent": "513"},
|
||||
"542": {"name": "担保", "parent": "513"},
|
||||
"543": {"name": "典当", "parent": "513"},
|
||||
"544": {"name": "拍卖", "parent": "513"},
|
||||
"545": {"name": "投资/融资", "parent": "513"},
|
||||
"546": {"name": "期货", "parent": "513"},
|
||||
"547": {"name": "房地产开发", "parent": "514"},
|
||||
"548": {"name": "工程施工", "parent": "514"},
|
||||
"549": {"name": "建筑设计", "parent": "514"},
|
||||
"550": {"name": "房地产代理", "parent": "514"},
|
||||
"551": {"name": "物业管理", "parent": "514"},
|
||||
"552": {"name": "室内设计", "parent": "514"},
|
||||
"553": {"name": "装修装潢", "parent": "514"},
|
||||
"554": {"name": "市政工程", "parent": "514"},
|
||||
"555": {"name": "工程造价", "parent": "514"},
|
||||
"556": {"name": "工程监理", "parent": "514"},
|
||||
"557": {"name": "环境工程", "parent": "514"},
|
||||
"558": {"name": "园林景观", "parent": "514"},
|
||||
"559": {"name": "法律", "parent": "515"},
|
||||
"560": {"name": "人力资源", "parent": "515"},
|
||||
"561": {"name": "会计", "parent": "1125"},
|
||||
"562": {"name": "审计", "parent": "515"},
|
||||
"563": {"name": "检测认证", "parent": "515"},
|
||||
"565": {"name": "翻译", "parent": "515"},
|
||||
"566": {"name": "中介", "parent": "515"},
|
||||
"567": {"name": "咨询", "parent": "515"},
|
||||
"568": {"name": "外包服务", "parent": "515"},
|
||||
"569": {"name": "家教", "parent": "516"},
|
||||
"570": {"name": "早教", "parent": "516"},
|
||||
"571": {"name": "职业技能培训", "parent": "516"},
|
||||
"572": {"name": "外语培训", "parent": "516"},
|
||||
"573": {"name": "设计培训", "parent": "516"},
|
||||
"574": {"name": "IT培训", "parent": "516"},
|
||||
"575": {"name": "文艺体育培训", "parent": "516"},
|
||||
"576": {"name": "学历教育", "parent": "516"},
|
||||
"577": {"name": "管理培训", "parent": "516"},
|
||||
"578": {"name": "民办基础教育", "parent": "516"},
|
||||
"579": {"name": "广告", "parent": "517"},
|
||||
"580": {"name": "媒体", "parent": "517"},
|
||||
"581": {"name": "会展", "parent": "517"},
|
||||
"582": {"name": "公关", "parent": "517"},
|
||||
"583": {"name": "影视", "parent": "517"},
|
||||
"584": {"name": "艺术", "parent": "517"},
|
||||
"585": {"name": "文化传播", "parent": "517"},
|
||||
"586": {"name": "娱乐", "parent": "517"},
|
||||
"587": {"name": "体育", "parent": "517"},
|
||||
"588": {"name": "出版", "parent": "517"},
|
||||
"589": {"name": "休闲", "parent": "517"},
|
||||
"590": {"name": "动漫", "parent": "517"},
|
||||
"591": {"name": "市场推广", "parent": "517"},
|
||||
"592": {"name": "市场研究", "parent": "517"},
|
||||
"593": {"name": "食品", "parent": "1129"},
|
||||
"594": {"name": "饮料", "parent": "1129"},
|
||||
"595": {"name": "烟草", "parent": "1129"},
|
||||
"596": {"name": "酒品", "parent": "518"},
|
||||
"597": {"name": "服饰", "parent": "518"},
|
||||
"598": {"name": "纺织", "parent": "518"},
|
||||
"599": {"name": "化妆品", "parent": "1129"},
|
||||
"600": {"name": "日用品", "parent": "1129"},
|
||||
"601": {"name": "家电", "parent": "518"},
|
||||
"602": {"name": "家具", "parent": "518"},
|
||||
"603": {"name": "办公用品", "parent": "518"},
|
||||
"604": {"name": "奢侈品", "parent": "518"},
|
||||
"605": {"name": "珠宝", "parent": "518"},
|
||||
"606": {"name": "数码产品", "parent": "518"},
|
||||
"607": {"name": "玩具", "parent": "518"},
|
||||
"608": {"name": "图书", "parent": "518"},
|
||||
"609": {"name": "音像", "parent": "518"},
|
||||
"610": {"name": "钟表", "parent": "518"},
|
||||
"611": {"name": "箱包", "parent": "518"},
|
||||
"612": {"name": "母婴", "parent": "518"},
|
||||
"613": {"name": "营养保健", "parent": "518"},
|
||||
"614": {"name": "户外用品", "parent": "518"},
|
||||
"615": {"name": "健身器材", "parent": "518"},
|
||||
"616": {"name": "乐器", "parent": "518"},
|
||||
"617": {"name": "汽车用品", "parent": "518"},
|
||||
"619": {"name": "厨具", "parent": "518"},
|
||||
"620": {"name": "机械制造", "parent": "519"},
|
||||
"621": {"name": "流体控制", "parent": "519"},
|
||||
"622": {"name": "自动化控制", "parent": "519"},
|
||||
"623": {"name": "仪器仪表", "parent": "519"},
|
||||
"624": {"name": "航空/航天", "parent": "519"},
|
||||
"625": {"name": "交通设施", "parent": "519"},
|
||||
"626": {"name": "工业电子", "parent": "519"},
|
||||
"627": {"name": "建材", "parent": "519"},
|
||||
"628": {"name": "五金材料", "parent": "519"},
|
||||
"629": {"name": "汽车", "parent": "519"},
|
||||
"630": {"name": "印刷", "parent": "519"},
|
||||
"631": {"name": "造纸", "parent": "519"},
|
||||
"632": {"name": "包装", "parent": "519"},
|
||||
"633": {"name": "原材料及加工", "parent": "519"},
|
||||
"634": {"name": "物流", "parent": "520"},
|
||||
"635": {"name": "仓储", "parent": "520"},
|
||||
"636": {"name": "客运", "parent": "520"},
|
||||
"637": {"name": "快递", "parent": "520"},
|
||||
"638": {"name": "化学药", "parent": "522"},
|
||||
"639": {"name": "中药", "parent": "522"},
|
||||
"640": {"name": "生物制药", "parent": "522"},
|
||||
"641": {"name": "兽药", "parent": "522"},
|
||||
"642": {"name": "农药", "parent": "522"},
|
||||
"643": {"name": "CRO", "parent": "522"},
|
||||
"644": {"name": "消毒", "parent": "522"},
|
||||
"645": {"name": "医药商业", "parent": "522"},
|
||||
"646": {"name": "医疗服务", "parent": "522"},
|
||||
"647": {"name": "医疗器械", "parent": "523"},
|
||||
"648": {"name": "制药设备", "parent": "523"},
|
||||
"649": {"name": "医用耗材", "parent": "523"},
|
||||
"650": {"name": "手术器械", "parent": "523"},
|
||||
"651": {"name": "保健器材", "parent": "524"},
|
||||
"652": {"name": "性保健品", "parent": "524"},
|
||||
"653": {"name": "医药保养", "parent": "524"},
|
||||
"654": {"name": "医用保健", "parent": "524"},
|
||||
"655": {"name": "酒店", "parent": "525"},
|
||||
"656": {"name": "餐饮", "parent": "525"},
|
||||
"657": {"name": "旅游", "parent": "525"},
|
||||
"658": {"name": "生活服务", "parent": "525"},
|
||||
"659": {"name": "保健服务", "parent": "525"},
|
||||
"660": {"name": "运动健身", "parent": "525"},
|
||||
"661": {"name": "家政服务", "parent": "525"},
|
||||
"662": {"name": "婚庆服务", "parent": "525"},
|
||||
"663": {"name": "租赁服务", "parent": "525"},
|
||||
"664": {"name": "维修服务", "parent": "525"},
|
||||
"665": {"name": "石油天然气", "parent": "526"},
|
||||
"666": {"name": "电力", "parent": "526"},
|
||||
"667": {"name": "新能源", "parent": "526"},
|
||||
"668": {"name": "水利", "parent": "526"},
|
||||
"669": {"name": "矿产", "parent": "526"},
|
||||
"670": {"name": "采掘业", "parent": "526"},
|
||||
"671": {"name": "冶炼", "parent": "526"},
|
||||
"672": {"name": "环保", "parent": "526"},
|
||||
"673": {"name": "无机化工原料", "parent": "527"},
|
||||
"674": {"name": "有机化工原料", "parent": "527"},
|
||||
"675": {"name": "精细化学品", "parent": "527"},
|
||||
"676": {"name": "化工设备", "parent": "527"},
|
||||
"677": {"name": "化工工程", "parent": "527"},
|
||||
"678": {"name": "资产管理", "parent": "513"},
|
||||
"679": {"name": "金融租赁", "parent": "513"},
|
||||
"680": {"name": "征信及信评机构", "parent": "513"},
|
||||
"681": {"name": "资产评估机构", "parent": "513"},
|
||||
"683": {"name": "金融监管机构", "parent": "513"},
|
||||
"684": {"name": "国际贸易", "parent": "521"},
|
||||
"685": {"name": "海关", "parent": "521"},
|
||||
"686": {"name": "购物中心", "parent": "536"},
|
||||
"687": {"name": "超市", "parent": "536"},
|
||||
"688": {"name": "便利店", "parent": "536"},
|
||||
"689": {"name": "专卖店", "parent": "536"},
|
||||
"690": {"name": "专业店", "parent": "536"},
|
||||
"691": {"name": "百货店", "parent": "536"},
|
||||
"692": {"name": "杂货店", "parent": "536"},
|
||||
"693": {"name": "个人银行", "parent": "537"},
|
||||
"695": {"name": "私人银行", "parent": "537"},
|
||||
"696": {"name": "公司银行", "parent": "537"},
|
||||
"697": {"name": "投资银行", "parent": "537"},
|
||||
"698": {"name": "政策性银行", "parent": "537"},
|
||||
"699": {"name": "中央银行", "parent": "537"},
|
||||
"700": {"name": "人寿险", "parent": "538"},
|
||||
"701": {"name": "财产险", "parent": "538"},
|
||||
"702": {"name": "再保险", "parent": "538"},
|
||||
"703": {"name": "养老险", "parent": "538"},
|
||||
"704": {"name": "保险代理公司", "parent": "538"},
|
||||
"705": {"name": "公募基金", "parent": "540"},
|
||||
"707": {"name": "私募基金", "parent": "540"},
|
||||
"708": {"name": "第三方理财", "parent": "679"},
|
||||
"709": {"name": "资产管理公司", "parent": "679"},
|
||||
"711": {"name": "房产中介", "parent": "566"},
|
||||
"712": {"name": "职业中介", "parent": "566"},
|
||||
"713": {"name": "婚姻中介", "parent": "566"},
|
||||
"714": {"name": "战略咨询", "parent": "567"},
|
||||
"715": {"name": "投资咨询", "parent": "567"},
|
||||
"716": {"name": "心理咨询", "parent": "567"},
|
||||
"717": {"name": "留学移民咨询", "parent": "567"},
|
||||
"718": {"name": "工商注册代理", "parent": "568"},
|
||||
"719": {"name": "商标专利代理", "parent": "568"},
|
||||
"720": {"name": "财务代理", "parent": "568"},
|
||||
"721": {"name": "工程机械", "parent": "620"},
|
||||
"722": {"name": "农业机械", "parent": "620"},
|
||||
"723": {"name": "海工设备", "parent": "620"},
|
||||
"724": {"name": "包装机械", "parent": "620"},
|
||||
"725": {"name": "印刷机械", "parent": "620"},
|
||||
"726": {"name": "数控机床", "parent": "620"},
|
||||
"727": {"name": "矿山机械", "parent": "620"},
|
||||
"728": {"name": "水泵", "parent": "621"},
|
||||
"729": {"name": "管道", "parent": "621"},
|
||||
"730": {"name": "阀门", "parent": "621"},
|
||||
"732": {"name": "压缩机", "parent": "621"},
|
||||
"733": {"name": "集散控制系统", "parent": "622"},
|
||||
"734": {"name": "远程控制", "parent": "622"},
|
||||
"735": {"name": "液压系统", "parent": "622"},
|
||||
"736": {"name": "楼宇智能化", "parent": "622"},
|
||||
"737": {"name": "飞机制造", "parent": "624"},
|
||||
"738": {"name": "航空公司", "parent": "624"},
|
||||
"739": {"name": "发动机", "parent": "624"},
|
||||
"740": {"name": "复合材料", "parent": "624"},
|
||||
"741": {"name": "高铁", "parent": "625"},
|
||||
"742": {"name": "地铁", "parent": "625"},
|
||||
"743": {"name": "信号传输", "parent": "625"},
|
||||
"745": {"name": "结构材料", "parent": "627"},
|
||||
"746": {"name": "装饰材料", "parent": "627"},
|
||||
"747": {"name": "专用材料", "parent": "627"},
|
||||
"749": {"name": "经销商集团", "parent": "629"},
|
||||
"750": {"name": "整车制造", "parent": "629"},
|
||||
"751": {"name": "汽车零配件", "parent": "629"},
|
||||
"752": {"name": "外型设计", "parent": "629"},
|
||||
"753": {"name": "平版印刷", "parent": "630"},
|
||||
"754": {"name": "凸版印刷", "parent": "630"},
|
||||
"755": {"name": "凹版印刷", "parent": "630"},
|
||||
"756": {"name": "孔版印刷", "parent": "630"},
|
||||
"757": {"name": "印刷用纸", "parent": "631"},
|
||||
"758": {"name": "书写、制图及复制用纸", "parent": "631"},
|
||||
"759": {"name": "包装用纸", "parent": "631"},
|
||||
"760": {"name": "生活、卫生及装饰用纸", "parent": "631"},
|
||||
"761": {"name": "技术用纸", "parent": "631"},
|
||||
"762": {"name": "加工纸原纸", "parent": "631"},
|
||||
"763": {"name": "食品包装", "parent": "632"},
|
||||
"764": {"name": "医药包装", "parent": "632"},
|
||||
"765": {"name": "日化包装", "parent": "632"},
|
||||
"766": {"name": "物流包装", "parent": "632"},
|
||||
"767": {"name": "礼品包装", "parent": "632"},
|
||||
"768": {"name": "电子五金包装", "parent": "632"},
|
||||
"769": {"name": "汽车服务", "parent": "525"},
|
||||
"770": {"name": "汽车保养", "parent": "769"},
|
||||
"771": {"name": "租车", "parent": "769"},
|
||||
"773": {"name": "出租车", "parent": "769"},
|
||||
"774": {"name": "代驾", "parent": "769"},
|
||||
"775": {"name": "发电", "parent": "666"},
|
||||
"777": {"name": "输配电", "parent": "666"},
|
||||
"779": {"name": "风电", "parent": "667"},
|
||||
"780": {"name": "光伏/太阳能", "parent": "667"},
|
||||
"781": {"name": "生物质发电", "parent": "667"},
|
||||
"782": {"name": "煤化工", "parent": "667"},
|
||||
"783": {"name": "垃圾发电", "parent": "667"},
|
||||
"784": {"name": "核电", "parent": "667"},
|
||||
"785": {"name": "能源矿产", "parent": "669"},
|
||||
"786": {"name": "金属矿产", "parent": "669"},
|
||||
"787": {"name": "非金属矿产", "parent": "669"},
|
||||
"788": {"name": "水气矿产", "parent": "669"},
|
||||
"789": {"name": "锅炉", "parent": "775"},
|
||||
"790": {"name": "发电机", "parent": "775"},
|
||||
"791": {"name": "汽轮机", "parent": "775"},
|
||||
"792": {"name": "燃机", "parent": "775"},
|
||||
"793": {"name": "冷却", "parent": "775"},
|
||||
"794": {"name": "电力设计院", "parent": "775"},
|
||||
"795": {"name": "高压输配电", "parent": "777"},
|
||||
"796": {"name": "中压输配电", "parent": "777"},
|
||||
"797": {"name": "低压输配电", "parent": "777"},
|
||||
"798": {"name": "继电保护", "parent": "777"},
|
||||
"799": {"name": "智能电网", "parent": "777"},
|
||||
"800": {"name": "小学", "parent": "516"},
|
||||
"801": {"name": "电动车", "parent": "519"},
|
||||
"802": {"name": "皮具箱包", "parent": "518"},
|
||||
"803": {"name": "医药制造", "parent": "522"},
|
||||
"804": {"name": "电器销售", "parent": "536"},
|
||||
"805": {"name": "塑料制品", "parent": "527"},
|
||||
"806": {"name": "公益基金会", "parent": "530"},
|
||||
"807": {"name": "美发服务", "parent": "525"},
|
||||
"808": {"name": "农业养殖", "parent": "531"},
|
||||
"809": {"name": "金融服务", "parent": "513"},
|
||||
"810": {"name": "商业地产综合体", "parent": "514"},
|
||||
"811": {"name": "美容服务", "parent": "525"},
|
||||
"812": {"name": "灯饰", "parent": "518"},
|
||||
"813": {"name": "油墨颜料产品", "parent": "527"},
|
||||
"814": {"name": "眼镜制造", "parent": "518"},
|
||||
"815": {"name": "农业生物技术", "parent": "531"},
|
||||
"816": {"name": "体育用品", "parent": "518"},
|
||||
"817": {"name": "保健用品", "parent": "524"},
|
||||
"818": {"name": "化学化工产品", "parent": "527"},
|
||||
"819": {"name": "饲料", "parent": "531"},
|
||||
"821": {"name": "保安服务", "parent": "525"},
|
||||
"822": {"name": "干细胞技术", "parent": "522"},
|
||||
"824": {"name": "农药化肥", "parent": "527"},
|
||||
"825": {"name": "卫生洁具", "parent": "518"},
|
||||
"826": {"name": "体育器材、场馆", "parent": "518"},
|
||||
"827": {"name": "饲料加工", "parent": "531"},
|
||||
"828": {"name": "测绘服务", "parent": "529"},
|
||||
"830": {"name": "金属船舶制造", "parent": "519"},
|
||||
"831": {"name": "基因工程", "parent": "522"},
|
||||
"832": {"name": "花卉服务", "parent": "536"},
|
||||
"833": {"name": "农业种植", "parent": "531"},
|
||||
"834": {"name": "皮革制品", "parent": "518"},
|
||||
"835": {"name": "地理信息加工服务", "parent": "529"},
|
||||
"836": {"name": "机器人", "parent": "519"},
|
||||
"837": {"name": "礼品", "parent": "518"},
|
||||
"838": {"name": "理发及美容服务", "parent": "525"},
|
||||
"839": {"name": "其他清洁服务", "parent": "525"},
|
||||
"840": {"name": "硅胶材料", "parent": "527"},
|
||||
"841": {"name": "茶叶销售", "parent": "518"},
|
||||
"842": {"name": "彩票活动", "parent": "529"},
|
||||
"843": {"name": "化妆培训", "parent": "516"},
|
||||
"844": {"name": "鞋业", "parent": "518"},
|
||||
"845": {"name": "酒店用品", "parent": "518"},
|
||||
"846": {"name": "复合材料", "parent": "527"},
|
||||
"847": {"name": "房地产工程建设", "parent": "548"},
|
||||
"848": {"name": "知识产权服务", "parent": "559"},
|
||||
"849": {"name": "新型建材", "parent": "627"},
|
||||
"850": {"name": "企业投资咨询", "parent": "567"},
|
||||
"851": {"name": "含乳饮料和植物蛋白饮料制造", "parent": "594"},
|
||||
"852": {"name": "汽车检测设备", "parent": "629"},
|
||||
"853": {"name": "手机通讯器材", "parent": "417"},
|
||||
"854": {"name": "环保材料", "parent": "672"},
|
||||
"855": {"name": "交通设施", "parent": "554"},
|
||||
"856": {"name": "电子器件", "parent": "419"},
|
||||
"857": {"name": "啤酒", "parent": "594"},
|
||||
"858": {"name": "生态旅游", "parent": "657"},
|
||||
"859": {"name": "自动化设备", "parent": "626"},
|
||||
"860": {"name": "软件开发", "parent": "414"},
|
||||
"861": {"name": "葡萄酒销售", "parent": "594"},
|
||||
"862": {"name": "钢材", "parent": "633"},
|
||||
"863": {"name": "餐饮培训", "parent": "656"},
|
||||
"864": {"name": "速冻食品", "parent": "593"},
|
||||
"865": {"name": "空气环保", "parent": "672"},
|
||||
"866": {"name": "互联网房地产经纪服务", "parent": "550"},
|
||||
"867": {"name": "食品添加剂", "parent": "593"},
|
||||
"868": {"name": "演艺传播", "parent": "585"},
|
||||
"869": {"name": "信用卡", "parent": "537"},
|
||||
"870": {"name": "报纸期刊广告", "parent": "579"},
|
||||
"871": {"name": "摄影", "parent": "525"},
|
||||
"872": {"name": "手机软件", "parent": "414"},
|
||||
"873": {"name": "地坪建材", "parent": "627"},
|
||||
"874": {"name": "企业管理咨询", "parent": "567"},
|
||||
"875": {"name": "幼儿教育", "parent": "570"},
|
||||
"876": {"name": "系统集成", "parent": "416"},
|
||||
"877": {"name": "皮革服饰", "parent": "597"},
|
||||
"878": {"name": "保健食品", "parent": "593"},
|
||||
"879": {"name": "叉车", "parent": "620"},
|
||||
"880": {"name": "厨卫电器", "parent": "601"},
|
||||
"882": {"name": "地暖设备", "parent": "627"},
|
||||
"883": {"name": "钢结构制造", "parent": "548"},
|
||||
"884": {"name": "投影机", "parent": "606"},
|
||||
"885": {"name": "啤酒销售", "parent": "594"},
|
||||
"886": {"name": "度假村旅游", "parent": "657"},
|
||||
"887": {"name": "电力元件设备", "parent": "626"},
|
||||
"888": {"name": "管理软件", "parent": "414"},
|
||||
"889": {"name": "轴承", "parent": "628"},
|
||||
"890": {"name": "餐饮设备", "parent": "656"},
|
||||
"891": {"name": "肉制品及副产品加工", "parent": "593"},
|
||||
"892": {"name": "艺术收藏品投资交易", "parent": "584"},
|
||||
"893": {"name": "净水器", "parent": "601"},
|
||||
"894": {"name": "进口食品", "parent": "593"},
|
||||
"895": {"name": "娱乐文化传播", "parent": "585"},
|
||||
"896": {"name": "文化传播", "parent": "585"},
|
||||
"897": {"name": "商旅传媒", "parent": "580"},
|
||||
"898": {"name": "广告设计制作", "parent": "579"},
|
||||
"899": {"name": "金属丝绳及其制品制造", "parent": "627"},
|
||||
"900": {"name": "建筑涂料", "parent": "627"},
|
||||
"901": {"name": "抵押贷款", "parent": "543"},
|
||||
"902": {"name": "早教", "parent": "570"},
|
||||
"903": {"name": "电影放映", "parent": "583"},
|
||||
"904": {"name": "内衣服饰", "parent": "597"},
|
||||
"905": {"name": "无线网络通信", "parent": "418"},
|
||||
"906": {"name": "记忆卡", "parent": "415"},
|
||||
"907": {"name": "女装服饰", "parent": "597"},
|
||||
"908": {"name": "建筑机械", "parent": "620"},
|
||||
"909": {"name": "制冷电器", "parent": "601"},
|
||||
"910": {"name": "通信设备", "parent": "417"},
|
||||
"911": {"name": "空调设备", "parent": "601"},
|
||||
"912": {"name": "建筑装饰", "parent": "553"},
|
||||
"913": {"name": "办公设备", "parent": "603"},
|
||||
"916": {"name": "数据处理软件", "parent": "414"},
|
||||
"917": {"name": "葡萄酒贸易", "parent": "594"},
|
||||
"918": {"name": "通讯器材", "parent": "417"},
|
||||
"919": {"name": "铜业", "parent": "633"},
|
||||
"920": {"name": "食堂", "parent": "656"},
|
||||
"921": {"name": "糖果零食", "parent": "593"},
|
||||
"922": {"name": "文化艺术传播", "parent": "584"},
|
||||
"923": {"name": "太阳能电器", "parent": "601"},
|
||||
"924": {"name": "药品零售", "parent": "645"},
|
||||
"925": {"name": "果蔬食品", "parent": "593"},
|
||||
"926": {"name": "文化活动策划", "parent": "585"},
|
||||
"928": {"name": "汽车广告", "parent": "657"},
|
||||
"929": {"name": "条码设备", "parent": "630"},
|
||||
"930": {"name": "建筑石材", "parent": "627"},
|
||||
"931": {"name": "贵金属", "parent": "545"},
|
||||
"932": {"name": "体育", "parent": "660"},
|
||||
"933": {"name": "金融信息服务", "parent": "414"},
|
||||
"934": {"name": "玻璃建材", "parent": "627"},
|
||||
"935": {"name": "家教", "parent": "569"},
|
||||
"936": {"name": "歌舞厅娱乐活动", "parent": "586"},
|
||||
"937": {"name": "计算机服务器", "parent": "415"},
|
||||
"938": {"name": "管道", "parent": "627"},
|
||||
"939": {"name": "婴幼儿服饰", "parent": "597"},
|
||||
"940": {"name": "热水器", "parent": "601"},
|
||||
"941": {"name": "计算机及零部件制造", "parent": "415"},
|
||||
"942": {"name": "钢铁贸易", "parent": "633"},
|
||||
"944": {"name": "包装材料", "parent": "632"},
|
||||
"945": {"name": "计算机办公设备", "parent": "603"},
|
||||
"946": {"name": "白酒", "parent": "594"},
|
||||
"948": {"name": "发动机", "parent": "620"},
|
||||
"949": {"name": "快餐服务", "parent": "656"},
|
||||
"950": {"name": "酒类销售", "parent": "594"},
|
||||
"951": {"name": "电子产品、机电设备", "parent": "626"},
|
||||
"952": {"name": "激光设备", "parent": "626"},
|
||||
"953": {"name": "餐饮策划", "parent": "656"},
|
||||
"954": {"name": "饮料、食品", "parent": "594"},
|
||||
"955": {"name": "文化娱乐经纪", "parent": "585"},
|
||||
"956": {"name": "天然气", "parent": "665"},
|
||||
"957": {"name": "农副食品", "parent": "593"},
|
||||
"958": {"name": "艺术表演", "parent": "585"},
|
||||
"959": {"name": "石膏、水泥制品及类似制品制造", "parent": "627"},
|
||||
"960": {"name": "橱柜", "parent": "602"},
|
||||
"961": {"name": "管理培训", "parent": "577"},
|
||||
"962": {"name": "男装服饰", "parent": "597"},
|
||||
"963": {"name": "化肥制造", "parent": "675"},
|
||||
"964": {"name": "童装服饰", "parent": "597"},
|
||||
"965": {"name": "电源电池", "parent": "626"},
|
||||
"966": {"name": "家电维修", "parent": "664"},
|
||||
"967": {"name": "光电子器件", "parent": "419"},
|
||||
"968": {"name": "旅行社服务", "parent": "657"},
|
||||
"969": {"name": "电线、电缆制造", "parent": "626"},
|
||||
"970": {"name": "软件开发、信息系统集成", "parent": "419"},
|
||||
"971": {"name": "白酒制造", "parent": "594"},
|
||||
"973": {"name": "甜品服务", "parent": "656"},
|
||||
"974": {"name": "糕点、面包制造", "parent": "593"},
|
||||
"975": {"name": "木工机械", "parent": "620"},
|
||||
"976": {"name": "酒吧服务", "parent": "656"},
|
||||
"977": {"name": "火腿肠", "parent": "593"},
|
||||
"978": {"name": "广告策划推广", "parent": "579"},
|
||||
"979": {"name": "新能源产品和生产装备制造", "parent": "667"},
|
||||
"980": {"name": "调味品", "parent": "593"},
|
||||
"981": {"name": "礼仪表演", "parent": "585"},
|
||||
"982": {"name": "劳务派遣", "parent": "560"},
|
||||
"983": {"name": "建材零售", "parent": "627"},
|
||||
"984": {"name": "商品交易中心", "parent": "545"},
|
||||
"985": {"name": "体育推广", "parent": "585"},
|
||||
"986": {"name": "茶饮料及其他饮料制造", "parent": "594"},
|
||||
"987": {"name": "金属建材", "parent": "627"},
|
||||
"988": {"name": "职业技能培训", "parent": "571"},
|
||||
"989": {"name": "网吧活动", "parent": "586"},
|
||||
"990": {"name": "洗衣服务", "parent": "658"},
|
||||
"991": {"name": "管道工程", "parent": "554"},
|
||||
"992": {"name": "通信工程", "parent": "417"},
|
||||
"993": {"name": "电子元器件", "parent": "626"},
|
||||
"994": {"name": "电子设备", "parent": "419"},
|
||||
"995": {"name": "茶馆服务", "parent": "656"},
|
||||
"996": {"name": "旅游开发", "parent": "657"},
|
||||
"997": {"name": "视频通讯", "parent": "417"},
|
||||
"998": {"name": "白酒销售", "parent": "594"},
|
||||
"1000": {"name": "咖啡馆服务", "parent": "656"},
|
||||
"1001": {"name": "食品零售", "parent": "593"},
|
||||
"1002": {"name": "健康疗养旅游", "parent": "655"},
|
||||
"1003": {"name": "粮油食品", "parent": "593"},
|
||||
"1004": {"name": "儿童教育影视", "parent": "583"},
|
||||
"1005": {"name": "新能源发电", "parent": "667"},
|
||||
"1006": {"name": "旅游策划", "parent": "657"},
|
||||
"1007": {"name": "绘画", "parent": "575"},
|
||||
"1008": {"name": "方便面及其他方便食品", "parent": "593"},
|
||||
"1009": {"name": "房地产经纪", "parent": "550"},
|
||||
"1010": {"name": "母婴家政", "parent": "661"},
|
||||
"1011": {"name": "居家养老健康服务", "parent": "661"},
|
||||
"1012": {"name": "文化艺术投资", "parent": "545"},
|
||||
"1013": {"name": "运动健身", "parent": "660"},
|
||||
"1014": {"name": "瓶(罐)装饮用水制造", "parent": "594"},
|
||||
"1015": {"name": "金属门窗", "parent": "627"},
|
||||
"1016": {"name": "机动车检测", "parent": "563"},
|
||||
"1017": {"name": "货物运输", "parent": "634"},
|
||||
"1018": {"name": "服饰专卖", "parent": "690"},
|
||||
"1019": {"name": "酒店服装", "parent": "597"},
|
||||
"1020": {"name": "通讯软件", "parent": "417"},
|
||||
"1021": {"name": "消防工程", "parent": "554"},
|
||||
"1022": {"name": "嵌入式电子系统", "parent": "419"},
|
||||
"1023": {"name": "航空票务", "parent": "636"},
|
||||
"1024": {"name": "电气设备", "parent": "626"},
|
||||
"1025": {"name": "酒业贸易", "parent": "594"},
|
||||
"1027": {"name": "其他饮料及冷饮服务", "parent": "656"},
|
||||
"1028": {"name": "乳制品", "parent": "593"},
|
||||
"1029": {"name": "新闻期刊出版", "parent": "588"},
|
||||
"1030": {"name": "水污染治理", "parent": "672"},
|
||||
"1031": {"name": "谷物食品", "parent": "593"},
|
||||
"1032": {"name": "数字动漫设计制造服务", "parent": "590"},
|
||||
"1033": {"name": "医院", "parent": "646"},
|
||||
"1034": {"name": "旅游广告", "parent": "657"},
|
||||
"1035": {"name": "办公家具", "parent": "602"},
|
||||
"1036": {"name": "房地产营销策划", "parent": "550"},
|
||||
"1037": {"name": "保洁家政", "parent": "661"},
|
||||
"1038": {"name": "水泥制造", "parent": "627"},
|
||||
"1039": {"name": "市场研究咨询", "parent": "567"},
|
||||
"1040": {"name": "驾校", "parent": "571"},
|
||||
"1041": {"name": "正餐服务", "parent": "656"},
|
||||
"1043": {"name": "机动车燃油", "parent": "665"},
|
||||
"1044": {"name": "食品", "parent": "593"},
|
||||
"1045": {"name": "新能源汽车", "parent": "629"},
|
||||
"1046": {"name": "手机无线网络推广", "parent": "417"},
|
||||
"1047": {"name": "环保设备", "parent": "672"},
|
||||
"1048": {"name": "通讯工程", "parent": "418"},
|
||||
"1049": {"name": "半导体集成电路", "parent": "419"},
|
||||
"1050": {"name": "航空服务", "parent": "636"},
|
||||
"1051": {"name": "电机设备", "parent": "626"},
|
||||
"1052": {"name": "档案软件", "parent": "414"},
|
||||
"1053": {"name": "冷链物流服务", "parent": "634"},
|
||||
"1054": {"name": "小吃服务", "parent": "656"},
|
||||
"1055": {"name": "水产品加工", "parent": "593"},
|
||||
"1056": {"name": "图书出版", "parent": "588"},
|
||||
"1057": {"name": "固体废物治理", "parent": "672"},
|
||||
"1059": {"name": "坚果食品", "parent": "593"},
|
||||
"1060": {"name": "广告传媒", "parent": "579"},
|
||||
"1061": {"name": "电梯", "parent": "622"},
|
||||
"1062": {"name": "社区医疗与卫生院", "parent": "646"},
|
||||
"1063": {"name": "广告、印刷包装", "parent": "630"},
|
||||
"1064": {"name": "婚纱礼服", "parent": "662"},
|
||||
"1065": {"name": "地毯", "parent": "602"},
|
||||
"1066": {"name": "互联网物业", "parent": "551"},
|
||||
"1067": {"name": "跨境电商", "parent": "3"},
|
||||
"1068": {"name": "信息安全、系统集成", "parent": "9"},
|
||||
"1069": {"name": "专用汽车制造", "parent": "750"},
|
||||
"1070": {"name": "商品贸易", "parent": "3"},
|
||||
"1071": {"name": "墙壁装饰材料", "parent": "746"},
|
||||
"1072": {"name": "窗帘装饰材料", "parent": "746"},
|
||||
"1073": {"name": "电子商务、本地生活服务", "parent": "3"},
|
||||
"1075": {"name": "白酒电子商务", "parent": "3"},
|
||||
"1076": {"name": "商品贸易、电子商务", "parent": "3"},
|
||||
"1077": {"name": "木质装饰材料", "parent": "746"},
|
||||
"1078": {"name": "电子商务、汽车电商交易平台", "parent": "3"},
|
||||
"1079": {"name": "汽车轮胎", "parent": "751"},
|
||||
"1080": {"name": "气体压缩机械制造", "parent": "732"},
|
||||
"1081": {"name": "家装家具电子商务", "parent": "3"},
|
||||
"1082": {"name": "化妆品电子商务", "parent": "3"},
|
||||
"1083": {"name": "汽车销售", "parent": "749"},
|
||||
"1084": {"name": "新闻资讯网站", "parent": "510"},
|
||||
"1085": {"name": "母婴电商", "parent": "3"},
|
||||
"1086": {"name": "电商商务、收藏品交易", "parent": "3"},
|
||||
"1088": {"name": "电子商务、数码产品", "parent": "3"},
|
||||
"1089": {"name": "二手车交易", "parent": "749"},
|
||||
"1090": {"name": "游戏制作服务", "parent": "5"},
|
||||
"1091": {"name": "母婴服务", "parent": "510"},
|
||||
"1092": {"name": "家具电子商务", "parent": "3"},
|
||||
"1093": {"name": "汽车配件电子商务", "parent": "3"},
|
||||
"1094": {"name": "输配电设备", "parent": "777"},
|
||||
"1095": {"name": "矿山设备", "parent": "727"},
|
||||
"1096": {"name": "机床机械", "parent": "726"},
|
||||
"1097": {"name": "农产品电商", "parent": "3"},
|
||||
"1098": {"name": "陶瓷装饰材料", "parent": "746"},
|
||||
"1099": {"name": "车载联网设备", "parent": "487"},
|
||||
"1100": {"name": "汽车销售电子商务", "parent": "3"},
|
||||
"1101": {"name": "石油设备", "parent": "730"},
|
||||
"1102": {"name": "智能家居", "parent": "487"},
|
||||
"1103": {"name": "散热器", "parent": "751"},
|
||||
"1104": {"name": "电力工程", "parent": "775"},
|
||||
"1105": {"name": "生鲜电商", "parent": "3"},
|
||||
"1106": {"name": "互联网数据服务", "parent": "490"},
|
||||
"1107": {"name": "房车、商务车销售", "parent": "749"},
|
||||
"1108": {"name": "茶叶电子商务", "parent": "3"},
|
||||
"1109": {"name": "酒类电子商务", "parent": "3"},
|
||||
"1110": {"name": "阀门", "parent": "730"},
|
||||
"1111": {"name": "食品电商", "parent": "3"},
|
||||
"1112": {"name": "儿童摄影", "parent": "871"},
|
||||
"1113": {"name": "广告摄影", "parent": "871"},
|
||||
"1114": {"name": "婚纱摄影", "parent": "871"},
|
||||
"1115": {"name": "模具制造", "parent": "620"},
|
||||
"1116": {"name": "汽车模具", "parent": "629"},
|
||||
"1117": {"name": "认证咨询", "parent": "567"},
|
||||
"1118": {"name": "数字视觉制作服务", "parent": "590"},
|
||||
"1119": {"name": "牙科及医疗器械", "parent": "646"},
|
||||
"1120": {"name": "猎头招聘", "parent": "560"},
|
||||
"1121": {"name": "家居", "parent": "518"},
|
||||
"1122": {"name": "收藏品", "parent": "518"},
|
||||
"1123": {"name": "首饰", "parent": "518"},
|
||||
"1124": {"name": "工艺品", "parent": "518"},
|
||||
"1125": {"name": "财务", "parent": "515"},
|
||||
"1126": {"name": "税务", "parent": "515"},
|
||||
"1127": {"name": "分类信息", "parent": "2"},
|
||||
"1128": {"name": "宠物", "parent": "0"},
|
||||
"1129": {"name": "快消品", "parent": "518"},
|
||||
"1130": {"name": "人工智能", "parent": "2"},
|
||||
"1131": {"name": "农/林/牧/渔", "parent": "0"},
|
||||
}
|
||||
|
||||
|
||||
def get_names(id):
    """Return the category name chain for *id*: the category's own name
    first, followed by each ancestor's name up to the root.

    Unknown ids (including root parents such as "0"/"1" that are absent
    from TBL) produce an empty list / terminate the walk.

    The original implementation recursed through ``parent`` links; this
    version walks iteratively and keeps a ``seen`` set so a malformed,
    cyclic parent chain in TBL cannot recurse or loop forever.
    """
    names = []
    key = str(id)
    seen = set()  # guard against accidental cycles in the parent links
    while key in TBL and key not in seen:
        seen.add(key)
        entry = TBL[key]
        names.append(entry["name"])
        key = str(entry["parent"])
    return names
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(get_names("1119"))
|
||||
789
deepdoc/parser/resume/entities/regions.py
Normal file
789
deepdoc/parser/resume/entities/regions.py
Normal file
@@ -0,0 +1,789 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
TBL = {
|
||||
"2": {"name": "北京", "parent": "1"},
|
||||
"3": {"name": "天津", "parent": "1"},
|
||||
"4": {"name": "河北", "parent": "1"},
|
||||
"5": {"name": "山西", "parent": "1"},
|
||||
"6": {"name": "内蒙古", "parent": "1"},
|
||||
"7": {"name": "辽宁", "parent": "1"},
|
||||
"8": {"name": "吉林", "parent": "1"},
|
||||
"9": {"name": "黑龙江", "parent": "1"},
|
||||
"10": {"name": "上海", "parent": "1"},
|
||||
"11": {"name": "江苏", "parent": "1"},
|
||||
"12": {"name": "浙江", "parent": "1"},
|
||||
"13": {"name": "安徽", "parent": "1"},
|
||||
"14": {"name": "福建", "parent": "1"},
|
||||
"15": {"name": "江西", "parent": "1"},
|
||||
"16": {"name": "山东", "parent": "1"},
|
||||
"17": {"name": "河南", "parent": "1"},
|
||||
"18": {"name": "湖北", "parent": "1"},
|
||||
"19": {"name": "湖南", "parent": "1"},
|
||||
"20": {"name": "广东", "parent": "1"},
|
||||
"21": {"name": "广西", "parent": "1"},
|
||||
"22": {"name": "海南", "parent": "1"},
|
||||
"23": {"name": "重庆", "parent": "1"},
|
||||
"24": {"name": "四川", "parent": "1"},
|
||||
"25": {"name": "贵州", "parent": "1"},
|
||||
"26": {"name": "云南", "parent": "1"},
|
||||
"27": {"name": "西藏", "parent": "1"},
|
||||
"28": {"name": "陕西", "parent": "1"},
|
||||
"29": {"name": "甘肃", "parent": "1"},
|
||||
"30": {"name": "青海", "parent": "1"},
|
||||
"31": {"name": "宁夏", "parent": "1"},
|
||||
"32": {"name": "新疆", "parent": "1"},
|
||||
"33": {"name": "北京市", "parent": "2"},
|
||||
"34": {"name": "天津市", "parent": "3"},
|
||||
"35": {"name": "石家庄市", "parent": "4"},
|
||||
"36": {"name": "唐山市", "parent": "4"},
|
||||
"37": {"name": "秦皇岛市", "parent": "4"},
|
||||
"38": {"name": "邯郸市", "parent": "4"},
|
||||
"39": {"name": "邢台市", "parent": "4"},
|
||||
"40": {"name": "保定市", "parent": "4"},
|
||||
"41": {"name": "张家口市", "parent": "4"},
|
||||
"42": {"name": "承德市", "parent": "4"},
|
||||
"43": {"name": "沧州市", "parent": "4"},
|
||||
"44": {"name": "廊坊市", "parent": "4"},
|
||||
"45": {"name": "衡水市", "parent": "4"},
|
||||
"46": {"name": "太原市", "parent": "5"},
|
||||
"47": {"name": "大同市", "parent": "5"},
|
||||
"48": {"name": "阳泉市", "parent": "5"},
|
||||
"49": {"name": "长治市", "parent": "5"},
|
||||
"50": {"name": "晋城市", "parent": "5"},
|
||||
"51": {"name": "朔州市", "parent": "5"},
|
||||
"52": {"name": "晋中市", "parent": "5"},
|
||||
"53": {"name": "运城市", "parent": "5"},
|
||||
"54": {"name": "忻州市", "parent": "5"},
|
||||
"55": {"name": "临汾市", "parent": "5"},
|
||||
"56": {"name": "吕梁市", "parent": "5"},
|
||||
"57": {"name": "呼和浩特市", "parent": "6"},
|
||||
"58": {"name": "包头市", "parent": "6"},
|
||||
"59": {"name": "乌海市", "parent": "6"},
|
||||
"60": {"name": "赤峰市", "parent": "6"},
|
||||
"61": {"name": "通辽市", "parent": "6"},
|
||||
"62": {"name": "鄂尔多斯市", "parent": "6"},
|
||||
"63": {"name": "呼伦贝尔市", "parent": "6"},
|
||||
"64": {"name": "巴彦淖尔市", "parent": "6"},
|
||||
"65": {"name": "乌兰察布市", "parent": "6"},
|
||||
"66": {"name": "兴安盟", "parent": "6"},
|
||||
"67": {"name": "锡林郭勒盟", "parent": "6"},
|
||||
"68": {"name": "阿拉善盟", "parent": "6"},
|
||||
"69": {"name": "沈阳市", "parent": "7"},
|
||||
"70": {"name": "大连市", "parent": "7"},
|
||||
"71": {"name": "鞍山市", "parent": "7"},
|
||||
"72": {"name": "抚顺市", "parent": "7"},
|
||||
"73": {"name": "本溪市", "parent": "7"},
|
||||
"74": {"name": "丹东市", "parent": "7"},
|
||||
"75": {"name": "锦州市", "parent": "7"},
|
||||
"76": {"name": "营口市", "parent": "7"},
|
||||
"77": {"name": "阜新市", "parent": "7"},
|
||||
"78": {"name": "辽阳市", "parent": "7"},
|
||||
"79": {"name": "盘锦市", "parent": "7"},
|
||||
"80": {"name": "铁岭市", "parent": "7"},
|
||||
"81": {"name": "朝阳市", "parent": "7"},
|
||||
"82": {"name": "葫芦岛市", "parent": "7"},
|
||||
"83": {"name": "长春市", "parent": "8"},
|
||||
"84": {"name": "吉林市", "parent": "8"},
|
||||
"85": {"name": "四平市", "parent": "8"},
|
||||
"86": {"name": "辽源市", "parent": "8"},
|
||||
"87": {"name": "通化市", "parent": "8"},
|
||||
"88": {"name": "白山市", "parent": "8"},
|
||||
"89": {"name": "松原市", "parent": "8"},
|
||||
"90": {"name": "白城市", "parent": "8"},
|
||||
"91": {"name": "延边朝鲜族自治州", "parent": "8"},
|
||||
"92": {"name": "哈尔滨市", "parent": "9"},
|
||||
"93": {"name": "齐齐哈尔市", "parent": "9"},
|
||||
"94": {"name": "鸡西市", "parent": "9"},
|
||||
"95": {"name": "鹤岗市", "parent": "9"},
|
||||
"96": {"name": "双鸭山市", "parent": "9"},
|
||||
"97": {"name": "大庆市", "parent": "9"},
|
||||
"98": {"name": "伊春市", "parent": "9"},
|
||||
"99": {"name": "佳木斯市", "parent": "9"},
|
||||
"100": {"name": "七台河市", "parent": "9"},
|
||||
"101": {"name": "牡丹江市", "parent": "9"},
|
||||
"102": {"name": "黑河市", "parent": "9"},
|
||||
"103": {"name": "绥化市", "parent": "9"},
|
||||
"104": {"name": "大兴安岭地区", "parent": "9"},
|
||||
"105": {"name": "上海市", "parent": "10"},
|
||||
"106": {"name": "南京市", "parent": "11"},
|
||||
"107": {"name": "无锡市", "parent": "11"},
|
||||
"108": {"name": "徐州市", "parent": "11"},
|
||||
"109": {"name": "常州市", "parent": "11"},
|
||||
"110": {"name": "苏州市", "parent": "11"},
|
||||
"111": {"name": "南通市", "parent": "11"},
|
||||
"112": {"name": "连云港市", "parent": "11"},
|
||||
"113": {"name": "淮安市", "parent": "11"},
|
||||
"114": {"name": "盐城市", "parent": "11"},
|
||||
"115": {"name": "扬州市", "parent": "11"},
|
||||
"116": {"name": "镇江市", "parent": "11"},
|
||||
"117": {"name": "泰州市", "parent": "11"},
|
||||
"118": {"name": "宿迁市", "parent": "11"},
|
||||
"119": {"name": "杭州市", "parent": "12"},
|
||||
"120": {"name": "宁波市", "parent": "12"},
|
||||
"121": {"name": "温州市", "parent": "12"},
|
||||
"122": {"name": "嘉兴市", "parent": "12"},
|
||||
"123": {"name": "湖州市", "parent": "12"},
|
||||
"124": {"name": "绍兴市", "parent": "12"},
|
||||
"125": {"name": "金华市", "parent": "12"},
|
||||
"126": {"name": "衢州市", "parent": "12"},
|
||||
"127": {"name": "舟山市", "parent": "12"},
|
||||
"128": {"name": "台州市", "parent": "12"},
|
||||
"129": {"name": "丽水市", "parent": "12"},
|
||||
"130": {"name": "合肥市", "parent": "13"},
|
||||
"131": {"name": "芜湖市", "parent": "13"},
|
||||
"132": {"name": "蚌埠市", "parent": "13"},
|
||||
"133": {"name": "淮南市", "parent": "13"},
|
||||
"134": {"name": "马鞍山市", "parent": "13"},
|
||||
"135": {"name": "淮北市", "parent": "13"},
|
||||
"136": {"name": "铜陵市", "parent": "13"},
|
||||
"137": {"name": "安庆市", "parent": "13"},
|
||||
"138": {"name": "黄山市", "parent": "13"},
|
||||
"139": {"name": "滁州市", "parent": "13"},
|
||||
"140": {"name": "阜阳市", "parent": "13"},
|
||||
"141": {"name": "宿州市", "parent": "13"},
|
||||
"143": {"name": "六安市", "parent": "13"},
|
||||
"144": {"name": "亳州市", "parent": "13"},
|
||||
"145": {"name": "池州市", "parent": "13"},
|
||||
"146": {"name": "宣城市", "parent": "13"},
|
||||
"147": {"name": "福州市", "parent": "14"},
|
||||
"148": {"name": "厦门市", "parent": "14"},
|
||||
"149": {"name": "莆田市", "parent": "14"},
|
||||
"150": {"name": "三明市", "parent": "14"},
|
||||
"151": {"name": "泉州市", "parent": "14"},
|
||||
"152": {"name": "漳州市", "parent": "14"},
|
||||
"153": {"name": "南平市", "parent": "14"},
|
||||
"154": {"name": "龙岩市", "parent": "14"},
|
||||
"155": {"name": "宁德市", "parent": "14"},
|
||||
"156": {"name": "南昌市", "parent": "15"},
|
||||
"157": {"name": "景德镇市", "parent": "15"},
|
||||
"158": {"name": "萍乡市", "parent": "15"},
|
||||
"159": {"name": "九江市", "parent": "15"},
|
||||
"160": {"name": "新余市", "parent": "15"},
|
||||
"161": {"name": "鹰潭市", "parent": "15"},
|
||||
"162": {"name": "赣州市", "parent": "15"},
|
||||
"163": {"name": "吉安市", "parent": "15"},
|
||||
"164": {"name": "宜春市", "parent": "15"},
|
||||
"165": {"name": "抚州市", "parent": "15"},
|
||||
"166": {"name": "上饶市", "parent": "15"},
|
||||
"167": {"name": "济南市", "parent": "16"},
|
||||
"168": {"name": "青岛市", "parent": "16"},
|
||||
"169": {"name": "淄博市", "parent": "16"},
|
||||
"170": {"name": "枣庄市", "parent": "16"},
|
||||
"171": {"name": "东营市", "parent": "16"},
|
||||
"172": {"name": "烟台市", "parent": "16"},
|
||||
"173": {"name": "潍坊市", "parent": "16"},
|
||||
"174": {"name": "济宁市", "parent": "16"},
|
||||
"175": {"name": "泰安市", "parent": "16"},
|
||||
"176": {"name": "威海市", "parent": "16"},
|
||||
"177": {"name": "日照市", "parent": "16"},
|
||||
"179": {"name": "临沂市", "parent": "16"},
|
||||
"180": {"name": "德州市", "parent": "16"},
|
||||
"181": {"name": "聊城市", "parent": "16"},
|
||||
"182": {"name": "滨州市", "parent": "16"},
|
||||
"183": {"name": "菏泽市", "parent": "16"},
|
||||
"184": {"name": "郑州市", "parent": "17"},
|
||||
"185": {"name": "开封市", "parent": "17"},
|
||||
"186": {"name": "洛阳市", "parent": "17"},
|
||||
"187": {"name": "平顶山市", "parent": "17"},
|
||||
"188": {"name": "安阳市", "parent": "17"},
|
||||
"189": {"name": "鹤壁市", "parent": "17"},
|
||||
"190": {"name": "新乡市", "parent": "17"},
|
||||
"191": {"name": "焦作市", "parent": "17"},
|
||||
"192": {"name": "濮阳市", "parent": "17"},
|
||||
"193": {"name": "许昌市", "parent": "17"},
|
||||
"194": {"name": "漯河市", "parent": "17"},
|
||||
"195": {"name": "三门峡市", "parent": "17"},
|
||||
"196": {"name": "南阳市", "parent": "17"},
|
||||
"197": {"name": "商丘市", "parent": "17"},
|
||||
"198": {"name": "信阳市", "parent": "17"},
|
||||
"199": {"name": "周口市", "parent": "17"},
|
||||
"200": {"name": "驻马店市", "parent": "17"},
|
||||
"201": {"name": "武汉市", "parent": "18"},
|
||||
"202": {"name": "黄石市", "parent": "18"},
|
||||
"203": {"name": "十堰市", "parent": "18"},
|
||||
"204": {"name": "宜昌市", "parent": "18"},
|
||||
"205": {"name": "襄阳市", "parent": "18"},
|
||||
"206": {"name": "鄂州市", "parent": "18"},
|
||||
"207": {"name": "荆门市", "parent": "18"},
|
||||
"208": {"name": "孝感市", "parent": "18"},
|
||||
"209": {"name": "荆州市", "parent": "18"},
|
||||
"210": {"name": "黄冈市", "parent": "18"},
|
||||
"211": {"name": "咸宁市", "parent": "18"},
|
||||
"212": {"name": "随州市", "parent": "18"},
|
||||
"213": {"name": "恩施土家族苗族自治州", "parent": "18"},
|
||||
"215": {"name": "长沙市", "parent": "19"},
|
||||
"216": {"name": "株洲市", "parent": "19"},
|
||||
"217": {"name": "湘潭市", "parent": "19"},
|
||||
"218": {"name": "衡阳市", "parent": "19"},
|
||||
"219": {"name": "邵阳市", "parent": "19"},
|
||||
"220": {"name": "岳阳市", "parent": "19"},
|
||||
"221": {"name": "常德市", "parent": "19"},
|
||||
"222": {"name": "张家界市", "parent": "19"},
|
||||
"223": {"name": "益阳市", "parent": "19"},
|
||||
"224": {"name": "郴州市", "parent": "19"},
|
||||
"225": {"name": "永州市", "parent": "19"},
|
||||
"226": {"name": "怀化市", "parent": "19"},
|
||||
"227": {"name": "娄底市", "parent": "19"},
|
||||
"228": {"name": "湘西土家族苗族自治州", "parent": "19"},
|
||||
"229": {"name": "广州市", "parent": "20"},
|
||||
"230": {"name": "韶关市", "parent": "20"},
|
||||
"231": {"name": "深圳市", "parent": "20"},
|
||||
"232": {"name": "珠海市", "parent": "20"},
|
||||
"233": {"name": "汕头市", "parent": "20"},
|
||||
"234": {"name": "佛山市", "parent": "20"},
|
||||
"235": {"name": "江门市", "parent": "20"},
|
||||
"236": {"name": "湛江市", "parent": "20"},
|
||||
"237": {"name": "茂名市", "parent": "20"},
|
||||
"238": {"name": "肇庆市", "parent": "20"},
|
||||
"239": {"name": "惠州市", "parent": "20"},
|
||||
"240": {"name": "梅州市", "parent": "20"},
|
||||
"241": {"name": "汕尾市", "parent": "20"},
|
||||
"242": {"name": "河源市", "parent": "20"},
|
||||
"243": {"name": "阳江市", "parent": "20"},
|
||||
"244": {"name": "清远市", "parent": "20"},
|
||||
"245": {"name": "东莞市", "parent": "20"},
|
||||
"246": {"name": "中山市", "parent": "20"},
|
||||
"247": {"name": "潮州市", "parent": "20"},
|
||||
"248": {"name": "揭阳市", "parent": "20"},
|
||||
"249": {"name": "云浮市", "parent": "20"},
|
||||
"250": {"name": "南宁市", "parent": "21"},
|
||||
"251": {"name": "柳州市", "parent": "21"},
|
||||
"252": {"name": "桂林市", "parent": "21"},
|
||||
"253": {"name": "梧州市", "parent": "21"},
|
||||
"254": {"name": "北海市", "parent": "21"},
|
||||
"255": {"name": "防城港市", "parent": "21"},
|
||||
"256": {"name": "钦州市", "parent": "21"},
|
||||
"257": {"name": "贵港市", "parent": "21"},
|
||||
"258": {"name": "玉林市", "parent": "21"},
|
||||
"259": {"name": "百色市", "parent": "21"},
|
||||
"260": {"name": "贺州市", "parent": "21"},
|
||||
"261": {"name": "河池市", "parent": "21"},
|
||||
"262": {"name": "来宾市", "parent": "21"},
|
||||
"263": {"name": "崇左市", "parent": "21"},
|
||||
"264": {"name": "海口市", "parent": "22"},
|
||||
"265": {"name": "三亚市", "parent": "22"},
|
||||
"267": {"name": "重庆市", "parent": "23"},
|
||||
"268": {"name": "成都市", "parent": "24"},
|
||||
"269": {"name": "自贡市", "parent": "24"},
|
||||
"270": {"name": "攀枝花市", "parent": "24"},
|
||||
"271": {"name": "泸州市", "parent": "24"},
|
||||
"272": {"name": "德阳市", "parent": "24"},
|
||||
"273": {"name": "绵阳市", "parent": "24"},
|
||||
"274": {"name": "广元市", "parent": "24"},
|
||||
"275": {"name": "遂宁市", "parent": "24"},
|
||||
"276": {"name": "内江市", "parent": "24"},
|
||||
"277": {"name": "乐山市", "parent": "24"},
|
||||
"278": {"name": "南充市", "parent": "24"},
|
||||
"279": {"name": "眉山市", "parent": "24"},
|
||||
"280": {"name": "宜宾市", "parent": "24"},
|
||||
"281": {"name": "广安市", "parent": "24"},
|
||||
"282": {"name": "达州市", "parent": "24"},
|
||||
"283": {"name": "雅安市", "parent": "24"},
|
||||
"284": {"name": "巴中市", "parent": "24"},
|
||||
"285": {"name": "资阳市", "parent": "24"},
|
||||
"286": {"name": "阿坝藏族羌族自治州", "parent": "24"},
|
||||
"287": {"name": "甘孜藏族自治州", "parent": "24"},
|
||||
"288": {"name": "凉山彝族自治州", "parent": "24"},
|
||||
"289": {"name": "贵阳市", "parent": "25"},
|
||||
"290": {"name": "六盘水市", "parent": "25"},
|
||||
"291": {"name": "遵义市", "parent": "25"},
|
||||
"292": {"name": "安顺市", "parent": "25"},
|
||||
"293": {"name": "铜仁市", "parent": "25"},
|
||||
"294": {"name": "黔西南布依族苗族自治州", "parent": "25"},
|
||||
"295": {"name": "毕节市", "parent": "25"},
|
||||
"296": {"name": "黔东南苗族侗族自治州", "parent": "25"},
|
||||
"297": {"name": "黔南布依族苗族自治州", "parent": "25"},
|
||||
"298": {"name": "昆明市", "parent": "26"},
|
||||
"299": {"name": "曲靖市", "parent": "26"},
|
||||
"300": {"name": "玉溪市", "parent": "26"},
|
||||
"301": {"name": "保山市", "parent": "26"},
|
||||
"302": {"name": "昭通市", "parent": "26"},
|
||||
"303": {"name": "丽江市", "parent": "26"},
|
||||
"304": {"name": "普洱市", "parent": "26"},
|
||||
"305": {"name": "临沧市", "parent": "26"},
|
||||
"306": {"name": "楚雄彝族自治州", "parent": "26"},
|
||||
"307": {"name": "红河哈尼族彝族自治州", "parent": "26"},
|
||||
"308": {"name": "文山壮族苗族自治州", "parent": "26"},
|
||||
"309": {"name": "西双版纳傣族自治州", "parent": "26"},
|
||||
"310": {"name": "大理白族自治州", "parent": "26"},
|
||||
"311": {"name": "德宏傣族景颇族自治州", "parent": "26"},
|
||||
"312": {"name": "怒江傈僳族自治州", "parent": "26"},
|
||||
"313": {"name": "迪庆藏族自治州", "parent": "26"},
|
||||
"314": {"name": "拉萨市", "parent": "27"},
|
||||
"315": {"name": "昌都市", "parent": "27"},
|
||||
"316": {"name": "山南市", "parent": "27"},
|
||||
"317": {"name": "日喀则市", "parent": "27"},
|
||||
"318": {"name": "那曲市", "parent": "27"},
|
||||
"319": {"name": "阿里地区", "parent": "27"},
|
||||
"320": {"name": "林芝市", "parent": "27"},
|
||||
"321": {"name": "西安市", "parent": "28"},
|
||||
"322": {"name": "铜川市", "parent": "28"},
|
||||
"323": {"name": "宝鸡市", "parent": "28"},
|
||||
"324": {"name": "咸阳市", "parent": "28"},
|
||||
"325": {"name": "渭南市", "parent": "28"},
|
||||
"326": {"name": "延安市", "parent": "28"},
|
||||
"327": {"name": "汉中市", "parent": "28"},
|
||||
"328": {"name": "榆林市", "parent": "28"},
|
||||
"329": {"name": "安康市", "parent": "28"},
|
||||
"330": {"name": "商洛市", "parent": "28"},
|
||||
"331": {"name": "兰州市", "parent": "29"},
|
||||
"332": {"name": "嘉峪关市", "parent": "29"},
|
||||
"333": {"name": "金昌市", "parent": "29"},
|
||||
"334": {"name": "白银市", "parent": "29"},
|
||||
"335": {"name": "天水市", "parent": "29"},
|
||||
"336": {"name": "武威市", "parent": "29"},
|
||||
"337": {"name": "张掖市", "parent": "29"},
|
||||
"338": {"name": "平凉市", "parent": "29"},
|
||||
"339": {"name": "酒泉市", "parent": "29"},
|
||||
"340": {"name": "庆阳市", "parent": "29"},
|
||||
"341": {"name": "定西市", "parent": "29"},
|
||||
"342": {"name": "陇南市", "parent": "29"},
|
||||
"343": {"name": "临夏回族自治州", "parent": "29"},
|
||||
"344": {"name": "甘南藏族自治州", "parent": "29"},
|
||||
"345": {"name": "西宁市", "parent": "30"},
|
||||
"346": {"name": "海东市", "parent": "30"},
|
||||
"347": {"name": "海北藏族自治州", "parent": "30"},
|
||||
"348": {"name": "黄南藏族自治州", "parent": "30"},
|
||||
"349": {"name": "海南藏族自治州", "parent": "30"},
|
||||
"350": {"name": "果洛藏族自治州", "parent": "30"},
|
||||
"351": {"name": "玉树藏族自治州", "parent": "30"},
|
||||
"352": {"name": "海西蒙古族藏族自治州", "parent": "30"},
|
||||
"353": {"name": "银川市", "parent": "31"},
|
||||
"354": {"name": "石嘴山市", "parent": "31"},
|
||||
"355": {"name": "吴忠市", "parent": "31"},
|
||||
"356": {"name": "固原市", "parent": "31"},
|
||||
"357": {"name": "中卫市", "parent": "31"},
|
||||
"358": {"name": "乌鲁木齐市", "parent": "32"},
|
||||
"359": {"name": "克拉玛依市", "parent": "32"},
|
||||
"360": {"name": "吐鲁番市", "parent": "32"},
|
||||
"361": {"name": "哈密市", "parent": "32"},
|
||||
"362": {"name": "昌吉回族自治州", "parent": "32"},
|
||||
"363": {"name": "博尔塔拉蒙古自治州", "parent": "32"},
|
||||
"364": {"name": "巴音郭楞蒙古自治州", "parent": "32"},
|
||||
"365": {"name": "阿克苏地区", "parent": "32"},
|
||||
"366": {"name": "克孜勒苏柯尔克孜自治州", "parent": "32"},
|
||||
"367": {"name": "喀什地区", "parent": "32"},
|
||||
"368": {"name": "和田地区", "parent": "32"},
|
||||
"369": {"name": "伊犁哈萨克自治州", "parent": "32"},
|
||||
"370": {"name": "塔城地区", "parent": "32"},
|
||||
"371": {"name": "阿勒泰地区", "parent": "32"},
|
||||
"372": {"name": "新疆省直辖行政单位", "parent": "32"},
|
||||
"373": {"name": "可克达拉市", "parent": "32"},
|
||||
"374": {"name": "昆玉市", "parent": "32"},
|
||||
"375": {"name": "胡杨河市", "parent": "32"},
|
||||
"376": {"name": "双河市", "parent": "32"},
|
||||
"3560": {"name": "北票市", "parent": "7"},
|
||||
"3615": {"name": "高州市", "parent": "20"},
|
||||
"3651": {"name": "济源市", "parent": "17"},
|
||||
"3662": {"name": "胶南市", "parent": "16"},
|
||||
"3683": {"name": "老河口市", "parent": "18"},
|
||||
"3758": {"name": "沙河市", "parent": "4"},
|
||||
"3822": {"name": "宜城市", "parent": "18"},
|
||||
"3842": {"name": "枣阳市", "parent": "18"},
|
||||
"3850": {"name": "肇东市", "parent": "9"},
|
||||
"3905": {"name": "澳门", "parent": "1"},
|
||||
"3906": {"name": "澳门", "parent": "3905"},
|
||||
"3907": {"name": "香港", "parent": "1"},
|
||||
"3908": {"name": "香港", "parent": "3907"},
|
||||
"3947": {"name": "仙桃市", "parent": "18"},
|
||||
"3954": {"name": "台湾", "parent": "1"},
|
||||
"3955": {"name": "台湾", "parent": "3954"},
|
||||
"3956": {"name": "海外", "parent": "1"},
|
||||
"3957": {"name": "海外", "parent": "3956"},
|
||||
"3958": {"name": "美国", "parent": "3956"},
|
||||
"3959": {"name": "加拿大", "parent": "3956"},
|
||||
"3961": {"name": "日本", "parent": "3956"},
|
||||
"3962": {"name": "韩国", "parent": "3956"},
|
||||
"3963": {"name": "德国", "parent": "3956"},
|
||||
"3964": {"name": "英国", "parent": "3956"},
|
||||
"3965": {"name": "意大利", "parent": "3956"},
|
||||
"3966": {"name": "西班牙", "parent": "3956"},
|
||||
"3967": {"name": "法国", "parent": "3956"},
|
||||
"3968": {"name": "澳大利亚", "parent": "3956"},
|
||||
"3969": {"name": "东城区", "parent": "2"},
|
||||
"3970": {"name": "西城区", "parent": "2"},
|
||||
"3971": {"name": "崇文区", "parent": "2"},
|
||||
"3972": {"name": "宣武区", "parent": "2"},
|
||||
"3973": {"name": "朝阳区", "parent": "2"},
|
||||
"3974": {"name": "海淀区", "parent": "2"},
|
||||
"3975": {"name": "丰台区", "parent": "2"},
|
||||
"3976": {"name": "石景山区", "parent": "2"},
|
||||
"3977": {"name": "门头沟区", "parent": "2"},
|
||||
"3978": {"name": "房山区", "parent": "2"},
|
||||
"3979": {"name": "通州区", "parent": "2"},
|
||||
"3980": {"name": "顺义区", "parent": "2"},
|
||||
"3981": {"name": "昌平区", "parent": "2"},
|
||||
"3982": {"name": "大兴区", "parent": "2"},
|
||||
"3983": {"name": "平谷区", "parent": "2"},
|
||||
"3984": {"name": "怀柔区", "parent": "2"},
|
||||
"3985": {"name": "密云区", "parent": "2"},
|
||||
"3986": {"name": "延庆区", "parent": "2"},
|
||||
"3987": {"name": "黄浦区", "parent": "10"},
|
||||
"3988": {"name": "徐汇区", "parent": "10"},
|
||||
"3989": {"name": "长宁区", "parent": "10"},
|
||||
"3990": {"name": "静安区", "parent": "10"},
|
||||
"3991": {"name": "普陀区", "parent": "10"},
|
||||
"3992": {"name": "闸北区", "parent": "10"},
|
||||
"3993": {"name": "虹口区", "parent": "10"},
|
||||
"3994": {"name": "杨浦区", "parent": "10"},
|
||||
"3995": {"name": "宝山区", "parent": "10"},
|
||||
"3996": {"name": "闵行区", "parent": "10"},
|
||||
"3997": {"name": "嘉定区", "parent": "10"},
|
||||
"3998": {"name": "浦东新区", "parent": "10"},
|
||||
"3999": {"name": "松江区", "parent": "10"},
|
||||
"4000": {"name": "金山区", "parent": "10"},
|
||||
"4001": {"name": "青浦区", "parent": "10"},
|
||||
"4002": {"name": "奉贤区", "parent": "10"},
|
||||
"4003": {"name": "崇明区", "parent": "10"},
|
||||
"4004": {"name": "和平区", "parent": "3"},
|
||||
"4005": {"name": "河东区", "parent": "3"},
|
||||
"4006": {"name": "河西区", "parent": "3"},
|
||||
"4007": {"name": "南开区", "parent": "3"},
|
||||
"4008": {"name": "红桥区", "parent": "3"},
|
||||
"4009": {"name": "河北区", "parent": "3"},
|
||||
"4010": {"name": "滨海新区", "parent": "3"},
|
||||
"4011": {"name": "东丽区", "parent": "3"},
|
||||
"4012": {"name": "西青区", "parent": "3"},
|
||||
"4013": {"name": "北辰区", "parent": "3"},
|
||||
"4014": {"name": "津南区", "parent": "3"},
|
||||
"4015": {"name": "武清区", "parent": "3"},
|
||||
"4016": {"name": "宝坻区", "parent": "3"},
|
||||
"4017": {"name": "静海区", "parent": "3"},
|
||||
"4018": {"name": "宁河区", "parent": "3"},
|
||||
"4019": {"name": "蓟州区", "parent": "3"},
|
||||
"4020": {"name": "渝中区", "parent": "23"},
|
||||
"4021": {"name": "江北区", "parent": "23"},
|
||||
"4022": {"name": "南岸区", "parent": "23"},
|
||||
"4023": {"name": "沙坪坝区", "parent": "23"},
|
||||
"4024": {"name": "九龙坡区", "parent": "23"},
|
||||
"4025": {"name": "大渡口区", "parent": "23"},
|
||||
"4026": {"name": "渝北区", "parent": "23"},
|
||||
"4027": {"name": "巴南区", "parent": "23"},
|
||||
"4028": {"name": "北碚区", "parent": "23"},
|
||||
"4029": {"name": "万州区", "parent": "23"},
|
||||
"4030": {"name": "黔江区", "parent": "23"},
|
||||
"4031": {"name": "永川区", "parent": "23"},
|
||||
"4032": {"name": "涪陵区", "parent": "23"},
|
||||
"4033": {"name": "江津区", "parent": "23"},
|
||||
"4034": {"name": "合川区", "parent": "23"},
|
||||
"4035": {"name": "双桥区", "parent": "23"},
|
||||
"4036": {"name": "万盛区", "parent": "23"},
|
||||
"4037": {"name": "荣昌区", "parent": "23"},
|
||||
"4038": {"name": "大足区", "parent": "23"},
|
||||
"4039": {"name": "璧山区", "parent": "23"},
|
||||
"4040": {"name": "铜梁区", "parent": "23"},
|
||||
"4041": {"name": "潼南区", "parent": "23"},
|
||||
"4042": {"name": "綦江区", "parent": "23"},
|
||||
"4043": {"name": "忠县", "parent": "23"},
|
||||
"4044": {"name": "开州区", "parent": "23"},
|
||||
"4045": {"name": "云阳县", "parent": "23"},
|
||||
"4046": {"name": "梁平区", "parent": "23"},
|
||||
"4047": {"name": "垫江县", "parent": "23"},
|
||||
"4048": {"name": "丰都县", "parent": "23"},
|
||||
"4049": {"name": "奉节县", "parent": "23"},
|
||||
"4050": {"name": "巫山县", "parent": "23"},
|
||||
"4051": {"name": "巫溪县", "parent": "23"},
|
||||
"4052": {"name": "城口县", "parent": "23"},
|
||||
"4053": {"name": "武隆区", "parent": "23"},
|
||||
"4054": {"name": "石柱土家族自治县", "parent": "23"},
|
||||
"4055": {"name": "秀山土家族苗族自治县", "parent": "23"},
|
||||
"4056": {"name": "酉阳土家族苗族自治县", "parent": "23"},
|
||||
"4057": {"name": "彭水苗族土家族自治县", "parent": "23"},
|
||||
"4058": {"name": "潜江市", "parent": "18"},
|
||||
"4059": {"name": "三沙市", "parent": "22"},
|
||||
"4060": {"name": "石河子市", "parent": "32"},
|
||||
"4061": {"name": "阿拉尔市", "parent": "32"},
|
||||
"4062": {"name": "图木舒克市", "parent": "32"},
|
||||
"4063": {"name": "五家渠市", "parent": "32"},
|
||||
"4064": {"name": "北屯市", "parent": "32"},
|
||||
"4065": {"name": "铁门关市", "parent": "32"},
|
||||
"4066": {"name": "儋州市", "parent": "22"},
|
||||
"4067": {"name": "五指山市", "parent": "22"},
|
||||
"4068": {"name": "文昌市", "parent": "22"},
|
||||
"4069": {"name": "琼海市", "parent": "22"},
|
||||
"4070": {"name": "万宁市", "parent": "22"},
|
||||
"4072": {"name": "定安县", "parent": "22"},
|
||||
"4073": {"name": "屯昌县", "parent": "22"},
|
||||
"4074": {"name": "澄迈县", "parent": "22"},
|
||||
"4075": {"name": "临高县", "parent": "22"},
|
||||
"4076": {"name": "琼中黎族苗族自治县", "parent": "22"},
|
||||
"4077": {"name": "保亭黎族苗族自治县", "parent": "22"},
|
||||
"4078": {"name": "白沙黎族自治县", "parent": "22"},
|
||||
"4079": {"name": "昌江黎族自治县", "parent": "22"},
|
||||
"4080": {"name": "乐东黎族自治县", "parent": "22"},
|
||||
"4081": {"name": "陵水黎族自治县", "parent": "22"},
|
||||
"4082": {"name": "马来西亚", "parent": "3956"},
|
||||
"6047": {"name": "长寿区", "parent": "23"},
|
||||
"6857": {"name": "阿富汗", "parent": "3956"},
|
||||
"6858": {"name": "阿尔巴尼亚", "parent": "3956"},
|
||||
"6859": {"name": "阿尔及利亚", "parent": "3956"},
|
||||
"6860": {"name": "美属萨摩亚", "parent": "3956"},
|
||||
"6861": {"name": "安道尔", "parent": "3956"},
|
||||
"6862": {"name": "安哥拉", "parent": "3956"},
|
||||
"6863": {"name": "安圭拉", "parent": "3956"},
|
||||
"6864": {"name": "南极洲", "parent": "3956"},
|
||||
"6865": {"name": "安提瓜和巴布达", "parent": "3956"},
|
||||
"6866": {"name": "阿根廷", "parent": "3956"},
|
||||
"6867": {"name": "亚美尼亚", "parent": "3956"},
|
||||
"6869": {"name": "奥地利", "parent": "3956"},
|
||||
"6870": {"name": "阿塞拜疆", "parent": "3956"},
|
||||
"6871": {"name": "巴哈马", "parent": "3956"},
|
||||
"6872": {"name": "巴林", "parent": "3956"},
|
||||
"6873": {"name": "孟加拉国", "parent": "3956"},
|
||||
"6874": {"name": "巴巴多斯", "parent": "3956"},
|
||||
"6875": {"name": "白俄罗斯", "parent": "3956"},
|
||||
"6876": {"name": "比利时", "parent": "3956"},
|
||||
"6877": {"name": "伯利兹", "parent": "3956"},
|
||||
"6878": {"name": "贝宁", "parent": "3956"},
|
||||
"6879": {"name": "百慕大", "parent": "3956"},
|
||||
"6880": {"name": "不丹", "parent": "3956"},
|
||||
"6881": {"name": "玻利维亚", "parent": "3956"},
|
||||
"6882": {"name": "波黑", "parent": "3956"},
|
||||
"6883": {"name": "博茨瓦纳", "parent": "3956"},
|
||||
"6884": {"name": "布维岛", "parent": "3956"},
|
||||
"6885": {"name": "巴西", "parent": "3956"},
|
||||
"6886": {"name": "英属印度洋领土", "parent": "3956"},
|
||||
"6887": {"name": "文莱", "parent": "3956"},
|
||||
"6888": {"name": "保加利亚", "parent": "3956"},
|
||||
"6889": {"name": "布基纳法索", "parent": "3956"},
|
||||
"6890": {"name": "布隆迪", "parent": "3956"},
|
||||
"6891": {"name": "柬埔寨", "parent": "3956"},
|
||||
"6892": {"name": "喀麦隆", "parent": "3956"},
|
||||
"6893": {"name": "佛得角", "parent": "3956"},
|
||||
"6894": {"name": "开曼群岛", "parent": "3956"},
|
||||
"6895": {"name": "中非", "parent": "3956"},
|
||||
"6896": {"name": "乍得", "parent": "3956"},
|
||||
"6897": {"name": "智利", "parent": "3956"},
|
||||
"6898": {"name": "圣诞岛", "parent": "3956"},
|
||||
"6899": {"name": "科科斯(基林)群岛", "parent": "3956"},
|
||||
"6900": {"name": "哥伦比亚", "parent": "3956"},
|
||||
"6901": {"name": "科摩罗", "parent": "3956"},
|
||||
"6902": {"name": "刚果(布)", "parent": "3956"},
|
||||
"6903": {"name": "刚果(金)", "parent": "3956"},
|
||||
"6904": {"name": "库克群岛", "parent": "3956"},
|
||||
"6905": {"name": "哥斯达黎加", "parent": "3956"},
|
||||
"6906": {"name": "科特迪瓦", "parent": "3956"},
|
||||
"6907": {"name": "克罗地亚", "parent": "3956"},
|
||||
"6908": {"name": "古巴", "parent": "3956"},
|
||||
"6909": {"name": "塞浦路斯", "parent": "3956"},
|
||||
"6910": {"name": "捷克", "parent": "3956"},
|
||||
"6911": {"name": "丹麦", "parent": "3956"},
|
||||
"6912": {"name": "吉布提", "parent": "3956"},
|
||||
"6913": {"name": "多米尼克", "parent": "3956"},
|
||||
"6914": {"name": "多米尼加共和国", "parent": "3956"},
|
||||
"6915": {"name": "东帝汶", "parent": "3956"},
|
||||
"6916": {"name": "厄瓜多尔", "parent": "3956"},
|
||||
"6917": {"name": "埃及", "parent": "3956"},
|
||||
"6918": {"name": "萨尔瓦多", "parent": "3956"},
|
||||
"6919": {"name": "赤道几内亚", "parent": "3956"},
|
||||
"6920": {"name": "厄立特里亚", "parent": "3956"},
|
||||
"6921": {"name": "爱沙尼亚", "parent": "3956"},
|
||||
"6922": {"name": "埃塞俄比亚", "parent": "3956"},
|
||||
"6923": {"name": "福克兰群岛(马尔维纳斯)", "parent": "3956"},
|
||||
"6924": {"name": "法罗群岛", "parent": "3956"},
|
||||
"6925": {"name": "斐济", "parent": "3956"},
|
||||
"6926": {"name": "芬兰", "parent": "3956"},
|
||||
"6927": {"name": "法属圭亚那", "parent": "3956"},
|
||||
"6928": {"name": "法属波利尼西亚", "parent": "3956"},
|
||||
"6929": {"name": "法属南部领土", "parent": "3956"},
|
||||
"6930": {"name": "加蓬", "parent": "3956"},
|
||||
"6931": {"name": "冈比亚", "parent": "3956"},
|
||||
"6932": {"name": "格鲁吉亚", "parent": "3956"},
|
||||
"6933": {"name": "加纳", "parent": "3956"},
|
||||
"6934": {"name": "直布罗陀", "parent": "3956"},
|
||||
"6935": {"name": "希腊", "parent": "3956"},
|
||||
"6936": {"name": "格陵兰", "parent": "3956"},
|
||||
"6937": {"name": "格林纳达", "parent": "3956"},
|
||||
"6938": {"name": "瓜德罗普", "parent": "3956"},
|
||||
"6939": {"name": "关岛", "parent": "3956"},
|
||||
"6940": {"name": "危地马拉", "parent": "3956"},
|
||||
"6941": {"name": "几内亚", "parent": "3956"},
|
||||
"6942": {"name": "几内亚比绍", "parent": "3956"},
|
||||
"6943": {"name": "圭亚那", "parent": "3956"},
|
||||
"6944": {"name": "海地", "parent": "3956"},
|
||||
"6945": {"name": "赫德岛和麦克唐纳岛", "parent": "3956"},
|
||||
"6946": {"name": "洪都拉斯", "parent": "3956"},
|
||||
"6947": {"name": "匈牙利", "parent": "3956"},
|
||||
"6948": {"name": "冰岛", "parent": "3956"},
|
||||
"6949": {"name": "印度", "parent": "3956"},
|
||||
"6950": {"name": "印度尼西亚", "parent": "3956"},
|
||||
"6951": {"name": "伊朗", "parent": "3956"},
|
||||
"6952": {"name": "伊拉克", "parent": "3956"},
|
||||
"6953": {"name": "爱尔兰", "parent": "3956"},
|
||||
"6954": {"name": "以色列", "parent": "3956"},
|
||||
"6955": {"name": "牙买加", "parent": "3956"},
|
||||
"6956": {"name": "约旦", "parent": "3956"},
|
||||
"6957": {"name": "哈萨克斯坦", "parent": "3956"},
|
||||
"6958": {"name": "肯尼亚", "parent": "3956"},
|
||||
"6959": {"name": "基里巴斯", "parent": "3956"},
|
||||
"6960": {"name": "朝鲜", "parent": "3956"},
|
||||
"6961": {"name": "科威特", "parent": "3956"},
|
||||
"6962": {"name": "吉尔吉斯斯坦", "parent": "3956"},
|
||||
"6963": {"name": "老挝", "parent": "3956"},
|
||||
"6964": {"name": "拉脱维亚", "parent": "3956"},
|
||||
"6965": {"name": "黎巴嫩", "parent": "3956"},
|
||||
"6966": {"name": "莱索托", "parent": "3956"},
|
||||
"6967": {"name": "利比里亚", "parent": "3956"},
|
||||
"6968": {"name": "利比亚", "parent": "3956"},
|
||||
"6969": {"name": "列支敦士登", "parent": "3956"},
|
||||
"6970": {"name": "立陶宛", "parent": "3956"},
|
||||
"6971": {"name": "卢森堡", "parent": "3956"},
|
||||
"6972": {"name": "前南马其顿", "parent": "3956"},
|
||||
"6973": {"name": "马达加斯加", "parent": "3956"},
|
||||
"6974": {"name": "马拉维", "parent": "3956"},
|
||||
"6975": {"name": "马尔代夫", "parent": "3956"},
|
||||
"6976": {"name": "马里", "parent": "3956"},
|
||||
"6977": {"name": "马耳他", "parent": "3956"},
|
||||
"6978": {"name": "马绍尔群岛", "parent": "3956"},
|
||||
"6979": {"name": "马提尼克", "parent": "3956"},
|
||||
"6980": {"name": "毛里塔尼亚", "parent": "3956"},
|
||||
"6981": {"name": "毛里求斯", "parent": "3956"},
|
||||
"6982": {"name": "马约特", "parent": "3956"},
|
||||
"6983": {"name": "墨西哥", "parent": "3956"},
|
||||
"6984": {"name": "密克罗尼西亚联邦", "parent": "3956"},
|
||||
"6985": {"name": "摩尔多瓦", "parent": "3956"},
|
||||
"6986": {"name": "摩纳哥", "parent": "3956"},
|
||||
"6987": {"name": "蒙古", "parent": "3956"},
|
||||
"6988": {"name": "蒙特塞拉特", "parent": "3956"},
|
||||
"6989": {"name": "摩洛哥", "parent": "3956"},
|
||||
"6990": {"name": "莫桑比克", "parent": "3956"},
|
||||
"6991": {"name": "缅甸", "parent": "3956"},
|
||||
"6992": {"name": "纳米比亚", "parent": "3956"},
|
||||
"6993": {"name": "瑙鲁", "parent": "3956"},
|
||||
"6994": {"name": "尼泊尔", "parent": "3956"},
|
||||
"6995": {"name": "荷兰", "parent": "3956"},
|
||||
"6996": {"name": "荷属安的列斯", "parent": "3956"},
|
||||
"6997": {"name": "新喀里多尼亚", "parent": "3956"},
|
||||
"6998": {"name": "新西兰", "parent": "3956"},
|
||||
"6999": {"name": "尼加拉瓜", "parent": "3956"},
|
||||
"7000": {"name": "尼日尔", "parent": "3956"},
|
||||
"7001": {"name": "尼日利亚", "parent": "3956"},
|
||||
"7002": {"name": "纽埃", "parent": "3956"},
|
||||
"7003": {"name": "诺福克岛", "parent": "3956"},
|
||||
"7004": {"name": "北马里亚纳", "parent": "3956"},
|
||||
"7005": {"name": "挪威", "parent": "3956"},
|
||||
"7006": {"name": "阿曼", "parent": "3956"},
|
||||
"7007": {"name": "巴基斯坦", "parent": "3956"},
|
||||
"7008": {"name": "帕劳", "parent": "3956"},
|
||||
"7009": {"name": "巴勒斯坦", "parent": "3956"},
|
||||
"7010": {"name": "巴拿马", "parent": "3956"},
|
||||
"7011": {"name": "巴布亚新几内亚", "parent": "3956"},
|
||||
"7012": {"name": "巴拉圭", "parent": "3956"},
|
||||
"7013": {"name": "秘鲁", "parent": "3956"},
|
||||
"7014": {"name": "菲律宾", "parent": "3956"},
|
||||
"7015": {"name": "皮特凯恩群岛", "parent": "3956"},
|
||||
"7016": {"name": "波兰", "parent": "3956"},
|
||||
"7017": {"name": "葡萄牙", "parent": "3956"},
|
||||
"7018": {"name": "波多黎各", "parent": "3956"},
|
||||
"7019": {"name": "卡塔尔", "parent": "3956"},
|
||||
"7020": {"name": "留尼汪", "parent": "3956"},
|
||||
"7021": {"name": "罗马尼亚", "parent": "3956"},
|
||||
"7022": {"name": "俄罗斯联邦", "parent": "3956"},
|
||||
"7023": {"name": "卢旺达", "parent": "3956"},
|
||||
"7024": {"name": "圣赫勒拿", "parent": "3956"},
|
||||
"7025": {"name": "圣基茨和尼维斯", "parent": "3956"},
|
||||
"7026": {"name": "圣卢西亚", "parent": "3956"},
|
||||
"7027": {"name": "圣皮埃尔和密克隆", "parent": "3956"},
|
||||
"7028": {"name": "圣文森特和格林纳丁斯", "parent": "3956"},
|
||||
"7029": {"name": "萨摩亚", "parent": "3956"},
|
||||
"7030": {"name": "圣马力诺", "parent": "3956"},
|
||||
"7031": {"name": "圣多美和普林西比", "parent": "3956"},
|
||||
"7032": {"name": "沙特阿拉伯", "parent": "3956"},
|
||||
"7033": {"name": "塞内加尔", "parent": "3956"},
|
||||
"7034": {"name": "塞舌尔", "parent": "3956"},
|
||||
"7035": {"name": "塞拉利昂", "parent": "3956"},
|
||||
"7036": {"name": "新加坡", "parent": "3956"},
|
||||
"7037": {"name": "斯洛伐克", "parent": "3956"},
|
||||
"7038": {"name": "斯洛文尼亚", "parent": "3956"},
|
||||
"7039": {"name": "所罗门群岛", "parent": "3956"},
|
||||
"7040": {"name": "索马里", "parent": "3956"},
|
||||
"7041": {"name": "南非", "parent": "3956"},
|
||||
"7042": {"name": "南乔治亚岛和南桑德韦奇岛", "parent": "3956"},
|
||||
"7043": {"name": "斯里兰卡", "parent": "3956"},
|
||||
"7044": {"name": "苏丹", "parent": "3956"},
|
||||
"7045": {"name": "苏里南", "parent": "3956"},
|
||||
"7046": {"name": "斯瓦尔巴群岛", "parent": "3956"},
|
||||
"7047": {"name": "斯威士兰", "parent": "3956"},
|
||||
"7048": {"name": "瑞典", "parent": "3956"},
|
||||
"7049": {"name": "瑞士", "parent": "3956"},
|
||||
"7050": {"name": "叙利亚", "parent": "3956"},
|
||||
"7051": {"name": "塔吉克斯坦", "parent": "3956"},
|
||||
"7052": {"name": "坦桑尼亚", "parent": "3956"},
|
||||
"7053": {"name": "泰国", "parent": "3956"},
|
||||
"7054": {"name": "多哥", "parent": "3956"},
|
||||
"7055": {"name": "托克劳", "parent": "3956"},
|
||||
"7056": {"name": "汤加", "parent": "3956"},
|
||||
"7057": {"name": "特立尼达和多巴哥", "parent": "3956"},
|
||||
"7058": {"name": "突尼斯", "parent": "3956"},
|
||||
"7059": {"name": "土耳其", "parent": "3956"},
|
||||
"7060": {"name": "土库曼斯坦", "parent": "3956"},
|
||||
"7061": {"name": "特克斯科斯群岛", "parent": "3956"},
|
||||
"7062": {"name": "图瓦卢", "parent": "3956"},
|
||||
"7063": {"name": "乌干达", "parent": "3956"},
|
||||
"7064": {"name": "乌克兰", "parent": "3956"},
|
||||
"7065": {"name": "阿联酋", "parent": "3956"},
|
||||
"7066": {"name": "美国本土外小岛屿", "parent": "3956"},
|
||||
"7067": {"name": "乌拉圭", "parent": "3956"},
|
||||
"7068": {"name": "乌兹别克斯坦", "parent": "3956"},
|
||||
"7069": {"name": "瓦努阿图", "parent": "3956"},
|
||||
"7070": {"name": "梵蒂冈", "parent": "3956"},
|
||||
"7071": {"name": "委内瑞拉", "parent": "3956"},
|
||||
"7072": {"name": "越南", "parent": "3956"},
|
||||
"7073": {"name": "英属维尔京群岛", "parent": "3956"},
|
||||
"7074": {"name": "美属维尔京群岛", "parent": "3956"},
|
||||
"7075": {"name": "瓦利斯和富图纳", "parent": "3956"},
|
||||
"7076": {"name": "西撒哈拉", "parent": "3956"},
|
||||
"7077": {"name": "也门", "parent": "3956"},
|
||||
"7078": {"name": "南斯拉夫", "parent": "3956"},
|
||||
"7079": {"name": "赞比亚", "parent": "3956"},
|
||||
"7080": {"name": "津巴布韦", "parent": "3956"},
|
||||
"7081": {"name": "塞尔维亚", "parent": "3956"},
|
||||
"7082": {"name": "雄安新区", "parent": "4"},
|
||||
"7084": {"name": "天门市", "parent": "18"},
|
||||
}
|
||||
|
||||
# All region names in TBL, for O(1) membership tests in isName().
# Set comprehension over .values(): avoids building a throwaway list and
# iterating .items() when the keys are unused (ruff C403 / PERF102).
NM_SET = {v["name"] for v in TBL.values()}
|
||||
|
||||
|
||||
def get_names(id):
    """Resolve a region id to its chain of names: [name, parent, grandparent, ...].

    Empty ids and the literal string "none" yield an empty list.  A
    non-numeric id is treated as free text and returned as-is in a
    single-element list.  Unknown numeric ids yield an empty list.
    """
    if not id or str(id).lower() == "none":
        return []
    id = str(id)
    # Free-text region names (anything non-numeric) pass through untouched.
    if not re.match("[0-9]+$", id.strip()):
        return [id]
    entry = TBL.get(id)
    if not entry:
        return []
    # Walk up the hierarchy recursively via the parent id.
    names = [entry["name"]]
    names.extend(get_names(entry["parent"]))
    return names
|
||||
|
||||
|
||||
|
||||
def isName(nm):
    """Return True if *nm* looks like a known region name.

    Matches the raw name, the name with a "市" (city) suffix appended,
    or the name with a trailing province/autonomous-region suffix stripped.
    """
    if nm in NM_SET or nm + "市" in NM_SET:
        return True
    return re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET
|
||||
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
65
deepdoc/parser/resume/entities/res/corp.tks.freq.json
Normal file
@@ -0,0 +1,65 @@
|
||||
[
|
||||
"科技",
|
||||
"集团",
|
||||
"网络科技",
|
||||
"技术",
|
||||
"信息",
|
||||
"分公司",
|
||||
"信息技术",
|
||||
"发展",
|
||||
"科技股份",
|
||||
"网络",
|
||||
"贸易",
|
||||
"商贸",
|
||||
"工程",
|
||||
"企业",
|
||||
"集团股份",
|
||||
"商务",
|
||||
"工业",
|
||||
"控股集团",
|
||||
"国际贸易",
|
||||
"软件技术",
|
||||
"数码科技",
|
||||
"软件开发",
|
||||
"有限",
|
||||
"经营",
|
||||
"科技开发",
|
||||
"股份公司",
|
||||
"电子技术",
|
||||
"实业集团",
|
||||
"责任",
|
||||
"无限",
|
||||
"工程技术",
|
||||
"上市公司",
|
||||
"技术开发",
|
||||
"软件系统",
|
||||
"总公司",
|
||||
"网络服务",
|
||||
"ltd.",
|
||||
"technology",
|
||||
"company",
|
||||
"服务公司",
|
||||
"计算机技术",
|
||||
"计算机软件",
|
||||
"电子信息",
|
||||
"corporation",
|
||||
"计算机服务",
|
||||
"计算机系统",
|
||||
"有限公司",
|
||||
"事业部",
|
||||
"公司",
|
||||
"股份",
|
||||
"有限责任",
|
||||
"软件",
|
||||
"控股",
|
||||
"高科技",
|
||||
"房地产",
|
||||
"事业群",
|
||||
"部门",
|
||||
"电子商务",
|
||||
"人力资源顾问",
|
||||
"人力资源",
|
||||
"株式会社",
|
||||
"网络营销"
|
||||
]
|
||||
|
||||
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
31480
deepdoc/parser/resume/entities/res/corp_baike_len.csv
Normal file
File diff suppressed because it is too large
Load Diff
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
14939
deepdoc/parser/resume/entities/res/corp_tag.json
Normal file
File diff suppressed because it is too large
Load Diff
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
911
deepdoc/parser/resume/entities/res/good_corp.json
Normal file
@@ -0,0 +1,911 @@
|
||||
[
|
||||
"google assistant investments",
|
||||
"amazon",
|
||||
"dingtalk china information",
|
||||
"zhejiang alibaba communication",
|
||||
"yunos",
|
||||
"腾讯云",
|
||||
"新浪新闻",
|
||||
"网邻通",
|
||||
"蚂蚁集团",
|
||||
"大疆",
|
||||
"恒生股份",
|
||||
"sf express",
|
||||
"智者天下",
|
||||
"shanghai hema network",
|
||||
"papayamobile",
|
||||
"lexinfintech",
|
||||
"industrial consumer finance",
|
||||
"360搜索",
|
||||
"世纪光速",
|
||||
"迅雷区块链",
|
||||
"赛盒科技",
|
||||
"齐力电子商务",
|
||||
"平安养老险",
|
||||
"平安证券",
|
||||
"平安好贷",
|
||||
"五八新服",
|
||||
"呯嘭智能",
|
||||
"阿里妈妈",
|
||||
"mdt",
|
||||
"tencent",
|
||||
"weibo",
|
||||
"浪潮软件",
|
||||
"阿里巴巴广告",
|
||||
"mashang consumer finance",
|
||||
"维沃",
|
||||
"hqg , limited",
|
||||
"moodys",
|
||||
"搜狐支付",
|
||||
"百度秀",
|
||||
"新浪服务",
|
||||
"零售通",
|
||||
"同城艺龙",
|
||||
"虾米音乐",
|
||||
"贝壳集团",
|
||||
"小米有品",
|
||||
"滴滴自动驾驶",
|
||||
"图记",
|
||||
"阿里影业",
|
||||
"卓联软件",
|
||||
"zhejiang tmall",
|
||||
"谷歌中国",
|
||||
"hithink flush",
|
||||
"时装科技",
|
||||
"程会玩国际旅行社",
|
||||
"amazon china holding limited",
|
||||
"中信消金",
|
||||
"当当比特物流",
|
||||
"新浪新媒体咨询",
|
||||
"tongcheng network",
|
||||
"金山在线",
|
||||
"shopping cart",
|
||||
"犀互动",
|
||||
"五八",
|
||||
"bilibili",
|
||||
"阿里星球",
|
||||
"滴滴金科服务",
|
||||
"美团",
|
||||
"哈啰出行",
|
||||
"face",
|
||||
"平安健康",
|
||||
"招商银行",
|
||||
"连亚",
|
||||
"盒马网络",
|
||||
"b站",
|
||||
"华为机器",
|
||||
"shanghai mdt infotech",
|
||||
"ping an healthkonnect",
|
||||
"beijing home link real estate broker",
|
||||
"花海仓",
|
||||
"beijing jingdong shangke information",
|
||||
"微影智能",
|
||||
"酷狗游戏",
|
||||
"health.pingan.com",
|
||||
"众安",
|
||||
"陌陌",
|
||||
"海康威视数字",
|
||||
"同程网",
|
||||
"艾丁金融",
|
||||
"知乎",
|
||||
" lu",
|
||||
"国际商业机器公司",
|
||||
"捷信消费金融",
|
||||
"恒生利融",
|
||||
"china merchants bank",
|
||||
"企鹅电竞",
|
||||
"捷信信驰",
|
||||
"360智能家居",
|
||||
"小桔车服",
|
||||
"homecredit",
|
||||
"皮皮虾",
|
||||
"畅游",
|
||||
"聚爱聊",
|
||||
"suning.com",
|
||||
"途牛旅游网",
|
||||
"花呗",
|
||||
"盈店通",
|
||||
"sina",
|
||||
"阿里巴巴音乐",
|
||||
"华为技术有限公司",
|
||||
"国付宝",
|
||||
"shanghai lianshang network",
|
||||
"oppo",
|
||||
"华为投资控股",
|
||||
"beijing sohu new media information",
|
||||
"times square",
|
||||
"菜鸟物流",
|
||||
"lingxing",
|
||||
"jd digits",
|
||||
"同程旅游",
|
||||
"分期乐",
|
||||
"火锅视频",
|
||||
"天天快报",
|
||||
"猎豹移动",
|
||||
"五八人力资源",
|
||||
"宝宝树",
|
||||
"顺丰科技",
|
||||
"上海西翠",
|
||||
"诗程文化传播",
|
||||
"dewu",
|
||||
"领星网络",
|
||||
"aliexpress",
|
||||
"贝塔通科技",
|
||||
"链家",
|
||||
"花小猪",
|
||||
"趣输入",
|
||||
"搜狐新媒体",
|
||||
"一淘",
|
||||
"56",
|
||||
"qq阅读",
|
||||
"青桔单车",
|
||||
"iflytek",
|
||||
"每日优鲜电子商务",
|
||||
"腾讯觅影",
|
||||
"微医",
|
||||
"松果网",
|
||||
"paypal",
|
||||
"递瑞供应链管理",
|
||||
"领星",
|
||||
"qunar",
|
||||
"三快",
|
||||
"lu.com",
|
||||
"携程旅行网",
|
||||
"新潮传媒",
|
||||
"链家经纪",
|
||||
"景域文化",
|
||||
"阿里健康",
|
||||
"pingpeng",
|
||||
"聚划算",
|
||||
"零机科技",
|
||||
"街兔电单车",
|
||||
"快乐购",
|
||||
"华为数字能源",
|
||||
"搜狐",
|
||||
"陆家嘴国际金融资产交易市场",
|
||||
"nanjing tuniu",
|
||||
"亚马逊",
|
||||
"苏宁易购",
|
||||
"携程旅游",
|
||||
"苏宁金服",
|
||||
"babytree",
|
||||
"悟空问答",
|
||||
"同花顺",
|
||||
"eastmoney",
|
||||
"浪潮信息",
|
||||
"滴滴智慧交通",
|
||||
"beijing ruixun lingtong",
|
||||
"平安综合金融服务",
|
||||
"爱奇艺",
|
||||
"小米集团",
|
||||
"华为云",
|
||||
"微店",
|
||||
"恒生集团",
|
||||
"网易有道",
|
||||
"boccfc",
|
||||
"世纪思速科技",
|
||||
"海康消防",
|
||||
"beijing xiaomi",
|
||||
"众安科技",
|
||||
"五八同城",
|
||||
"霆程汽车租赁",
|
||||
"云卖分销",
|
||||
"乐信集团",
|
||||
"蚂蚁",
|
||||
"舶乐蜜电子商务",
|
||||
"支付宝中国",
|
||||
"砖块消消消",
|
||||
"vivo",
|
||||
"阿里互娱",
|
||||
"中国平安",
|
||||
"lingxihudong",
|
||||
"百度网盘",
|
||||
"1号店",
|
||||
"字节跳动",
|
||||
"京东科技",
|
||||
"驴妈妈兴旅国际旅行社",
|
||||
"hangzhou alibaba music",
|
||||
"xunlei",
|
||||
"灵犀互动娱乐",
|
||||
"快手",
|
||||
"youtube",
|
||||
"连尚慧眼",
|
||||
"腾讯体育",
|
||||
"爱商在线",
|
||||
"酷我音乐",
|
||||
"金融壹账通",
|
||||
"搜狗服务",
|
||||
"banma information",
|
||||
"a站",
|
||||
"罗汉堂",
|
||||
"薇仕网络",
|
||||
"搜狐新闻",
|
||||
"贝宝",
|
||||
"薇仕",
|
||||
"口袋时尚科技",
|
||||
"穆迪咨询",
|
||||
"新狐投资管理",
|
||||
"hikvision",
|
||||
"alimama china holding limited",
|
||||
"超聚变数字",
|
||||
"腾讯视频",
|
||||
"恒生电子",
|
||||
"百度游戏",
|
||||
"绿洲",
|
||||
"木瓜移动",
|
||||
"红袖添香",
|
||||
"店匠科技",
|
||||
"易贝",
|
||||
"一淘网",
|
||||
"博览群书",
|
||||
"唯品会",
|
||||
"lazglobal",
|
||||
"amap",
|
||||
"芒果网",
|
||||
"口碑",
|
||||
"海康慧影",
|
||||
"腾讯音乐娱乐",
|
||||
"网易严选",
|
||||
"微信",
|
||||
"shenzhen lexin holding",
|
||||
"hangzhou pingpeng intelligent",
|
||||
"连尚网络",
|
||||
"海思",
|
||||
"isunor",
|
||||
"蝉翼",
|
||||
"阿里游戏",
|
||||
"广州优视",
|
||||
"优视",
|
||||
"腾讯征信",
|
||||
"识装",
|
||||
"finserve.pingan.com",
|
||||
"papaya",
|
||||
"阅文",
|
||||
"平安健康保险",
|
||||
"考拉海购",
|
||||
"网易印象",
|
||||
"wifi万能钥匙",
|
||||
"新浪互联服务",
|
||||
"亚马逊云科技",
|
||||
"迅雷看看",
|
||||
"华为朗新科技",
|
||||
"adyen hong kong limited",
|
||||
"谷歌",
|
||||
"得物",
|
||||
"网心",
|
||||
"cainiao network",
|
||||
"沐瞳",
|
||||
"linkedln",
|
||||
"hundsun",
|
||||
"阿里旅行",
|
||||
"珍爱网",
|
||||
"阿里巴巴通信",
|
||||
"金山奇剑",
|
||||
"tongtool",
|
||||
"华为安捷信电气",
|
||||
"快乐时代",
|
||||
"平安寿险",
|
||||
"微博",
|
||||
"微跳蚤",
|
||||
"oppo移动通信",
|
||||
"毒",
|
||||
"alimama",
|
||||
"shoplazza",
|
||||
"shenzhen dianjiang science and",
|
||||
"众鸣世科",
|
||||
"平安金融",
|
||||
"狐友",
|
||||
"维沃移动通信",
|
||||
"tobosoft",
|
||||
"齐力电商",
|
||||
"ali",
|
||||
"诚信通",
|
||||
"行吟",
|
||||
"跳舞的线",
|
||||
"橙心优选",
|
||||
"众安健康",
|
||||
"亚马逊中国投资",
|
||||
"德絮投资管理中心合伙",
|
||||
"招联消费金融",
|
||||
"百度文学",
|
||||
"芝麻信用",
|
||||
"阿里零售通",
|
||||
"时装",
|
||||
"花样直播",
|
||||
"sogou",
|
||||
"uc",
|
||||
"海思半导体",
|
||||
"zhongan online p&c insurance",
|
||||
"新浪数字",
|
||||
"驴妈妈旅游网",
|
||||
"华为数字能源技术",
|
||||
"京东数科",
|
||||
"oracle",
|
||||
"xiaomi",
|
||||
"nyse",
|
||||
"阳光消费金融",
|
||||
"天天动听",
|
||||
"大众点评",
|
||||
"上海瑞家",
|
||||
"trustpass",
|
||||
"hundsun technologies",
|
||||
"美团小贷",
|
||||
"ebay",
|
||||
"通途",
|
||||
"tcl",
|
||||
"鸿蒙",
|
||||
"酷狗计算机",
|
||||
"品诺保险",
|
||||
"capitalg",
|
||||
"康盛创想",
|
||||
"58同城",
|
||||
"闲鱼",
|
||||
"微软",
|
||||
"吉易付科技",
|
||||
"理财通",
|
||||
"ctrip",
|
||||
"yy",
|
||||
"华为数字",
|
||||
"kingsoft",
|
||||
"孙宁金融",
|
||||
"房江湖经纪",
|
||||
"youku",
|
||||
"ant financial services group",
|
||||
"盒马",
|
||||
"sensetime",
|
||||
"伊千网络",
|
||||
"小豹ai翻译棒",
|
||||
"shopify",
|
||||
"前海微众银行",
|
||||
"qd",
|
||||
"gmail",
|
||||
"pingpong",
|
||||
"alibaba group holding limited",
|
||||
"捷信时空电子商务",
|
||||
"orientsec",
|
||||
"乔戈里管理咨询",
|
||||
"ant",
|
||||
"锐讯灵通",
|
||||
"兴业消费金融",
|
||||
"京东叁佰陆拾度电子商务",
|
||||
"新浪",
|
||||
"优酷土豆",
|
||||
"海康机器人",
|
||||
"美团单车",
|
||||
"海康存储",
|
||||
"领英",
|
||||
"阿里全球速卖通",
|
||||
"美菜网",
|
||||
"京邦达",
|
||||
"安居客",
|
||||
"阿里体育",
|
||||
"相互宝",
|
||||
"cloudwalk",
|
||||
"百度智能云",
|
||||
"贝壳",
|
||||
"酷狗",
|
||||
"sunshine consumer finance",
|
||||
"掌宜",
|
||||
"奇酷网",
|
||||
"核新同花顺",
|
||||
"阿里巴巴影业",
|
||||
"节创",
|
||||
"学而思网校",
|
||||
"速途",
|
||||
"途牛",
|
||||
"阿里云计算",
|
||||
"beijing sensetime",
|
||||
"alibaba cloud",
|
||||
"西瓜视频",
|
||||
"美团优选",
|
||||
"orient securities limited",
|
||||
"华为朗新",
|
||||
"店匠",
|
||||
"shanghai weishi network",
|
||||
"友盟",
|
||||
"飞猪旅行",
|
||||
"滴滴出行",
|
||||
"alipay",
|
||||
"mogu",
|
||||
"dangdang",
|
||||
"大麦网",
|
||||
"汉军智能系统",
|
||||
"百度地图",
|
||||
"货车帮",
|
||||
"狐狸金服",
|
||||
"众安在线保险经纪",
|
||||
"华为通信",
|
||||
"新浪支付",
|
||||
"zhihu",
|
||||
"alibaba cloud computing",
|
||||
"沙发视频",
|
||||
"金山软件",
|
||||
"ping an good doctor",
|
||||
"携程",
|
||||
"脉脉",
|
||||
"youku information beijing",
|
||||
"zhongan",
|
||||
"艾丁软件",
|
||||
"乒乓智能",
|
||||
"蘑菇街",
|
||||
"taobao",
|
||||
"华为技术服务",
|
||||
"仕承文化传播",
|
||||
"安捷信",
|
||||
"狐狸互联网小额贷款",
|
||||
"节点迅捷",
|
||||
"中国银行",
|
||||
"搜镇",
|
||||
"众安在线",
|
||||
"dingtalk",
|
||||
"云从科技",
|
||||
"beijing jingbangda trade",
|
||||
"moody s",
|
||||
"滚动的天空",
|
||||
"yl.pingan.com",
|
||||
"奇虎",
|
||||
"alihealth",
|
||||
"芒果tv",
|
||||
"lufax",
|
||||
"美团打车",
|
||||
"小桔",
|
||||
"贝壳找房网",
|
||||
"小米科技",
|
||||
"vips",
|
||||
"kindle",
|
||||
"亚马逊服务",
|
||||
"citic consumer finance",
|
||||
"微众",
|
||||
"搜狗智慧互联网医院",
|
||||
"盒马鲜生",
|
||||
"life.pinan.com",
|
||||
"ph.com.cn",
|
||||
"银联",
|
||||
"cmbchina",
|
||||
"平安金融科技咨询",
|
||||
"微保",
|
||||
"甲骨文中国",
|
||||
"飞书",
|
||||
"koubei shanghai information",
|
||||
"企鹅辅导",
|
||||
"斑马",
|
||||
"平安租赁",
|
||||
"云从",
|
||||
"马上消费",
|
||||
"hangzhou ali baba advertising",
|
||||
"金山",
|
||||
"赛盒",
|
||||
"科大讯飞",
|
||||
"金星创业投资",
|
||||
"平安国际融资租赁",
|
||||
"360你财富",
|
||||
"西山居",
|
||||
"shenzhen qianhai fourth paradigm data",
|
||||
"海思光电子",
|
||||
"猎户星空",
|
||||
"网易公司",
|
||||
"浪潮",
|
||||
"粒粒橙传媒",
|
||||
"招联金融",
|
||||
"100. me",
|
||||
"捷信信驰咨询",
|
||||
"唯品仓",
|
||||
"orient",
|
||||
"趣拿",
|
||||
"摩拜单车",
|
||||
"天猫精灵",
|
||||
"菜鸟",
|
||||
"豹小贩",
|
||||
"去哪儿",
|
||||
"米家",
|
||||
"哈啰单车",
|
||||
"搜狐体育",
|
||||
"shopify payments usa",
|
||||
"高德软件",
|
||||
"讯联智付",
|
||||
"乐信",
|
||||
"唯你搭",
|
||||
"第四范式",
|
||||
"菜鸟网络",
|
||||
"同程",
|
||||
"yy语音",
|
||||
"浪潮云",
|
||||
"东财",
|
||||
"淘宝",
|
||||
"寻梦",
|
||||
"citic securities limited",
|
||||
"青橙之旅",
|
||||
"阿里巴巴",
|
||||
"番茄小说",
|
||||
"上海亿贝",
|
||||
"inspur",
|
||||
"babytree inc",
|
||||
"海康智慧产业股权投资基金合伙合伙",
|
||||
"adyen",
|
||||
"艺龙",
|
||||
"蚂蚁金服",
|
||||
"平安金服",
|
||||
"百度百科",
|
||||
"unionpay",
|
||||
"当当",
|
||||
"阅文集团",
|
||||
"东方财富",
|
||||
"东方证券",
|
||||
"哈罗单车",
|
||||
"优酷",
|
||||
"海康",
|
||||
"alipay china network",
|
||||
"网商银行",
|
||||
"钧正",
|
||||
"property.pingan.com",
|
||||
"豹咖啡",
|
||||
"网易",
|
||||
"我爱cba",
|
||||
"theduapp",
|
||||
"360",
|
||||
"金山数字娱乐",
|
||||
"新浪阅读",
|
||||
"alibabagames",
|
||||
"顺丰",
|
||||
"支点商贸",
|
||||
"同程旅行",
|
||||
"citic securities",
|
||||
"ele.com",
|
||||
"tal",
|
||||
"fresh hema",
|
||||
"运满满",
|
||||
"贝壳网",
|
||||
"酷狗音乐",
|
||||
"鲜城",
|
||||
"360健康",
|
||||
"浪潮世科",
|
||||
"迅雷网络",
|
||||
"哔哩哔哩",
|
||||
"华为电动",
|
||||
"淘友天下",
|
||||
"华多网络",
|
||||
"xunlei networking technologies",
|
||||
"云杉",
|
||||
"当当网电子商务",
|
||||
"津虹网络",
|
||||
"wedoc cloud hangzhou holdings",
|
||||
"alisports shanghai",
|
||||
"旷视金智",
|
||||
"钉钉中国",
|
||||
"微影",
|
||||
"金山快快",
|
||||
"亿贝",
|
||||
"wedoc",
|
||||
"autonavi",
|
||||
"哈啰助力车",
|
||||
"google cloud",
|
||||
"新浪乐居",
|
||||
"京东股票",
|
||||
"搜狗智慧远程医疗中心",
|
||||
"中银消金",
|
||||
"merchants union consumer finance",
|
||||
"王者荣耀",
|
||||
"百度手机",
|
||||
"美团民宿",
|
||||
"kaola",
|
||||
"小屋",
|
||||
"金山网络",
|
||||
"来往",
|
||||
"顺丰速运",
|
||||
"腾讯课堂",
|
||||
"百度在线网络",
|
||||
"美团买菜",
|
||||
"威视汽车",
|
||||
"uc mobile",
|
||||
"来赞达",
|
||||
"平安健康医疗",
|
||||
"豹小秘",
|
||||
"尚网",
|
||||
"哈勃投资",
|
||||
" ping an insurance group of china ,",
|
||||
"小米",
|
||||
"360好药",
|
||||
"qq音乐",
|
||||
"lingxigames",
|
||||
"faceu激萌",
|
||||
"搜狗",
|
||||
"sohu",
|
||||
"满帮",
|
||||
"vipshop",
|
||||
"wishpost",
|
||||
"金山世游",
|
||||
"shanghai yibaimi network",
|
||||
"1688",
|
||||
"海康汽车",
|
||||
"顺丰控股",
|
||||
"华为",
|
||||
"妙镜vr",
|
||||
"paybkj.com",
|
||||
"hellobike",
|
||||
"豹来电",
|
||||
"京东",
|
||||
"驴妈妈",
|
||||
"momo",
|
||||
"平安健康险",
|
||||
"哈勃科技",
|
||||
"美菜",
|
||||
"众安在线财产保险",
|
||||
"海康威视",
|
||||
"east money information",
|
||||
"阿里云",
|
||||
"蝉游记",
|
||||
"余额宝",
|
||||
"屋客",
|
||||
"滴滴",
|
||||
"shopify international limited",
|
||||
"百度",
|
||||
"阿里健康中国",
|
||||
"阿里通信",
|
||||
"微梦创科",
|
||||
"微医云",
|
||||
"轻颜相机",
|
||||
"搜易居",
|
||||
"趣店集团",
|
||||
"美团云",
|
||||
"ant group",
|
||||
"金山云",
|
||||
"beijing express hand",
|
||||
"觅觅",
|
||||
"支付宝",
|
||||
"滴滴承信科技咨询服务",
|
||||
"拼多多",
|
||||
"众安运动",
|
||||
"乞力电商",
|
||||
"youcash",
|
||||
"唯品金融",
|
||||
"陆金所",
|
||||
"本地生活",
|
||||
"sz dji",
|
||||
"海康智能",
|
||||
"魔方网聘",
|
||||
"青藤大学",
|
||||
"international business machines",
|
||||
"学而思",
|
||||
"beijing zhongming century science and",
|
||||
"猎豹清理大师",
|
||||
"asinking",
|
||||
"高德",
|
||||
"苏宁",
|
||||
"优酷网",
|
||||
"艾丁",
|
||||
"中银消费金融",
|
||||
"京东健康",
|
||||
"五八教育",
|
||||
"pingpongx",
|
||||
"搜狐时尚",
|
||||
"阿里广告",
|
||||
"平安财险",
|
||||
"中邮消金",
|
||||
"etao",
|
||||
"怕怕",
|
||||
"nyse:cmcm",
|
||||
"华为培训中心",
|
||||
"高德地图",
|
||||
"云狐天下征信",
|
||||
"大疆创新",
|
||||
"连尚",
|
||||
"壹佰米",
|
||||
"康健公司",
|
||||
"iqiyi.com",
|
||||
"360安全云盘",
|
||||
"馒头直播",
|
||||
"淘友网",
|
||||
"东方赢家",
|
||||
"bank of china",
|
||||
"微众银行",
|
||||
"阿里巴巴国际站",
|
||||
"虾米",
|
||||
"去哪儿网",
|
||||
"ctrip travel network shanghai",
|
||||
"潇湘书院",
|
||||
"腾讯",
|
||||
"快乐阳光互动娱乐传媒",
|
||||
"迅雷",
|
||||
"weidian",
|
||||
"滴滴货运",
|
||||
"ping an puhui enterprise management",
|
||||
"新浪仓石基金销售",
|
||||
"搜狐焦点",
|
||||
"alibaba pictures",
|
||||
"wps",
|
||||
"平安",
|
||||
"lazmall",
|
||||
"百度开放平台",
|
||||
"兴业消金",
|
||||
" 珍爱网",
|
||||
"京东云",
|
||||
"小红书",
|
||||
"1688. com",
|
||||
"如视智数",
|
||||
"missfresh",
|
||||
"pazl.pingan.cn",
|
||||
"平安集团",
|
||||
"kugou",
|
||||
"懂车帝",
|
||||
"斑马智行",
|
||||
"浪潮集团",
|
||||
"netease hangzhou network",
|
||||
"pagd.net",
|
||||
"探探",
|
||||
"chinaliterature",
|
||||
"amazon亚马逊",
|
||||
"alphabet",
|
||||
"当当文创手工艺品电子商务",
|
||||
"五八邦",
|
||||
"shenzhen zhenai network information",
|
||||
"lingshoutong",
|
||||
"字节",
|
||||
"lvmama",
|
||||
"金山办公",
|
||||
"众安保险",
|
||||
"时装信息",
|
||||
"优视科技",
|
||||
"guangzhou kugou",
|
||||
"ibm",
|
||||
"滴滴打车",
|
||||
"beijing sogou information service",
|
||||
"megvii",
|
||||
"健谈哥",
|
||||
"cloudwalk group",
|
||||
"蜂联科技",
|
||||
"冬云",
|
||||
"京东尚科",
|
||||
"钢琴块2",
|
||||
"京东世纪",
|
||||
"商汤",
|
||||
"众鸣世纪",
|
||||
"腾讯音乐",
|
||||
"迅雷网文化",
|
||||
"华为云计算技术",
|
||||
"live.me",
|
||||
"全球速卖通",
|
||||
"快的打车",
|
||||
"hello group inc",
|
||||
"美丽说",
|
||||
"suning",
|
||||
"opengauss",
|
||||
"lazada",
|
||||
"tmall",
|
||||
"acfun",
|
||||
"当当网",
|
||||
"中银",
|
||||
"旷视科技",
|
||||
"百度钱包",
|
||||
"淘宝网",
|
||||
"新浪微博",
|
||||
"迅雷集团",
|
||||
"中信消费金融",
|
||||
"学而思教育",
|
||||
"平安普惠",
|
||||
"悟空跨境",
|
||||
"irobotbox",
|
||||
"平安产险",
|
||||
"inspur group",
|
||||
"世纪卓越快递服务",
|
||||
"奇虎360",
|
||||
"webank",
|
||||
"偶藻",
|
||||
"唯品支付",
|
||||
"腾讯云计算",
|
||||
"众安服务",
|
||||
"亿之唐",
|
||||
"beijing 58 information ttechnology",
|
||||
"平安好医生",
|
||||
"迅雷之锤",
|
||||
"旅行小账本",
|
||||
"芒果游戏",
|
||||
"新浪传媒",
|
||||
"旷镜博煊",
|
||||
"全民k歌",
|
||||
"滴滴支付",
|
||||
"北京网心科技",
|
||||
"挂号网",
|
||||
"萤石",
|
||||
"chinavision media group limited",
|
||||
"猎豹安全大师",
|
||||
"cmcm",
|
||||
"趣店",
|
||||
"蚂蚁财富",
|
||||
"商汤科技",
|
||||
"甲骨文",
|
||||
"百度云",
|
||||
"百度apollo",
|
||||
"19 pay",
|
||||
"stock.pingan.com",
|
||||
"tiktok",
|
||||
"alibaba pictures group limited",
|
||||
"ele",
|
||||
"考拉",
|
||||
"天猫",
|
||||
"腾讯优图",
|
||||
"起点中文网",
|
||||
"百度视频",
|
||||
"shanghai bili bili",
|
||||
"京东物流",
|
||||
"ebay marketplaces gmbh",
|
||||
"alibaba sport",
|
||||
"wish",
|
||||
"阿里巴巴中国",
|
||||
"中国银联",
|
||||
"alibaba china network",
|
||||
"china ping an property insurance",
|
||||
"百度糯米网",
|
||||
"微软中国",
|
||||
"一九付",
|
||||
"4 paradigm",
|
||||
"叮咚买菜",
|
||||
"umeng",
|
||||
"众鸣科技",
|
||||
"平安财富通",
|
||||
"google",
|
||||
"巨量引擎",
|
||||
"百度贴吧",
|
||||
"beijing jingdong century information",
|
||||
"讯飞",
|
||||
"beijing yunshan information",
|
||||
"满运软件",
|
||||
"中邮消费金融",
|
||||
"饿了么",
|
||||
"alios",
|
||||
"腾讯ai实验室",
|
||||
"第四范式智能",
|
||||
"瀚星创业投资",
|
||||
"gradient ventures",
|
||||
"microsoft",
|
||||
"哈啰共享汽车",
|
||||
"乞力电子商务",
|
||||
"mscf",
|
||||
"网易影业文化",
|
||||
"铁友旅游咨询",
|
||||
"kilimall",
|
||||
"云企互联投资",
|
||||
"ping an financial consulting",
|
||||
"beijng jingdong century commerce",
|
||||
"高德威智能交通系统",
|
||||
"中友信息",
|
||||
"平安医疗健康管理",
|
||||
"eciticcfc",
|
||||
"中信证券",
|
||||
"fliggy",
|
||||
"电子湾",
|
||||
"旷云金智",
|
||||
"微粒贷",
|
||||
"rsi",
|
||||
"滴滴云计算",
|
||||
"google ventures",
|
||||
"箐程",
|
||||
"每日优鲜",
|
||||
"音兔",
|
||||
"拉扎斯",
|
||||
"今日头条",
|
||||
"乐信控股",
|
||||
"猎豹浏览器",
|
||||
"细微咨询",
|
||||
"好未来",
|
||||
"我乐",
|
||||
"绘声绘色",
|
||||
"抖音",
|
||||
"搜狐新时代",
|
||||
"飞猪",
|
||||
"鹅厂",
|
||||
"贝壳找房",
|
||||
"tuniu",
|
||||
"红马传媒文化",
|
||||
"钉钉",
|
||||
"马上消费金融",
|
||||
"360手机",
|
||||
"平安医保",
|
||||
"快途",
|
||||
"alibaba",
|
||||
"小哈换电",
|
||||
"大麦",
|
||||
"恒睿人工智能研究院",
|
||||
"谷歌资本",
|
||||
"猎豹",
|
||||
"穆迪信息"
|
||||
]
|
||||
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
595
deepdoc/parser/resume/entities/res/good_sch.json
Normal file
@@ -0,0 +1,595 @@
|
||||
[
|
||||
"中国科技大学",
|
||||
"国防科学技术大学",
|
||||
"清华大学",
|
||||
"清华",
|
||||
"tsinghua university",
|
||||
"thu",
|
||||
"北京大学",
|
||||
"北大",
|
||||
"beijing university",
|
||||
"pku",
|
||||
"中国科学技术大学",
|
||||
"中国科大",
|
||||
"中科大",
|
||||
"china science & technology university",
|
||||
"ustc",
|
||||
"复旦大学",
|
||||
"复旦",
|
||||
"fudan university",
|
||||
"fdu",
|
||||
"中国人民大学",
|
||||
"人大",
|
||||
"人民大学",
|
||||
"renmin university of china",
|
||||
"ruc",
|
||||
"上海交通大学",
|
||||
"上海交大",
|
||||
"shanghai jiao tong university",
|
||||
"sjtu",
|
||||
"南京大学",
|
||||
"南大",
|
||||
"nanjing university",
|
||||
"nju",
|
||||
"同济大学",
|
||||
"同济",
|
||||
"tongji university",
|
||||
"tongji",
|
||||
"浙江大学",
|
||||
"浙大",
|
||||
"zhejiang university",
|
||||
"zju",
|
||||
"南开大学",
|
||||
"南开",
|
||||
"nankai university",
|
||||
"nku",
|
||||
"北京航空航天大学",
|
||||
"北航",
|
||||
"beihang university",
|
||||
"buaa",
|
||||
"北京师范大学",
|
||||
"北师",
|
||||
"北师大",
|
||||
"beijing normal university",
|
||||
"bnu",
|
||||
"武汉大学",
|
||||
"武大",
|
||||
"wuhan university",
|
||||
"whu",
|
||||
"西安交通大学",
|
||||
"西安交大",
|
||||
"xi’an jiaotong university",
|
||||
"xjtu",
|
||||
"天津大学",
|
||||
"天大",
|
||||
"university of tianjin",
|
||||
"tju",
|
||||
"华中科技大学",
|
||||
"华中大",
|
||||
"central china university science and technology",
|
||||
"hust",
|
||||
"北京理工大学",
|
||||
"北理",
|
||||
"beijing institute of technology",
|
||||
"bit",
|
||||
"东南大学",
|
||||
"东大",
|
||||
"southeast china university",
|
||||
"seu",
|
||||
"中山大学",
|
||||
"中大",
|
||||
"zhongshan university",
|
||||
"sysu",
|
||||
"华东师范大学",
|
||||
"华师大",
|
||||
"east china normal university",
|
||||
"ecnu",
|
||||
"哈尔滨工业大学",
|
||||
"哈工大",
|
||||
"harbin institute of technology",
|
||||
"hit",
|
||||
"厦门大学",
|
||||
"厦大",
|
||||
"xiamen university",
|
||||
"xmu",
|
||||
"西北工业大学",
|
||||
"西工大",
|
||||
"西北工大",
|
||||
"northwestern polytechnical university",
|
||||
"npu",
|
||||
"中南大学",
|
||||
"中南",
|
||||
"middle and southern university",
|
||||
"csu",
|
||||
"大连理工大学",
|
||||
"大工",
|
||||
"institute of technology of dalian",
|
||||
"dut",
|
||||
"四川大学",
|
||||
"川大",
|
||||
"sichuan university",
|
||||
"scu",
|
||||
"电子科技大学",
|
||||
"电子科大",
|
||||
"university of electronic science and technology of china",
|
||||
"uestc",
|
||||
"华南理工大学",
|
||||
"华南理工",
|
||||
"institutes of technology of south china",
|
||||
"scut",
|
||||
"吉林大学",
|
||||
"吉大",
|
||||
"jilin university",
|
||||
"jlu",
|
||||
"湖南大学",
|
||||
"湖大",
|
||||
"hunan university",
|
||||
"hnu",
|
||||
"重庆大学",
|
||||
"重大",
|
||||
"university of chongqing",
|
||||
"cqu",
|
||||
"山东大学",
|
||||
"山大",
|
||||
"shandong university",
|
||||
"sdu",
|
||||
"中国农业大学",
|
||||
"中国农大",
|
||||
"china agricultural university",
|
||||
"cau",
|
||||
"中国海洋大学",
|
||||
"中国海大",
|
||||
"chinese marine university",
|
||||
"ouc",
|
||||
"中央民族大学",
|
||||
"中央民大",
|
||||
"central university for nationalities",
|
||||
"muc",
|
||||
"东北大学",
|
||||
"东北工学院",
|
||||
"northeastern university",
|
||||
"neu 或 nu",
|
||||
"兰州大学",
|
||||
"兰大",
|
||||
"lanzhou university",
|
||||
"lzu",
|
||||
"西北农林科技大学",
|
||||
"西农","西北农大",
|
||||
"northwest a&f university",
|
||||
"nwafu",
|
||||
"中国人民解放军国防科技大学",
|
||||
"国防科技大学","国防科大",
|
||||
"national university of defense technology",
|
||||
"nudt",
|
||||
"郑州大学",
|
||||
"郑大",
|
||||
"zhengzhou university",
|
||||
"zzu",
|
||||
"云南大学",
|
||||
"云大",
|
||||
"yunnan university",
|
||||
"ynu",
|
||||
"新疆大学",
|
||||
"新大",
|
||||
"xinjiang university",
|
||||
"xju",
|
||||
"北京交通大学",
|
||||
"北京交大",
|
||||
"beijing jiaotong university",
|
||||
"bjtu",
|
||||
"北京工业大学",
|
||||
"北工大",
|
||||
"beijing university of technology",
|
||||
"bjut",
|
||||
"北京科技大学",
|
||||
"北科大","北京科大",
|
||||
"university of science and technology beijing",
|
||||
"ustb",
|
||||
"北京化工大学",
|
||||
"北化",
|
||||
"beijing university of chemical technology",
|
||||
"buct",
|
||||
"北京邮电大学",
|
||||
"北邮",
|
||||
"beijing university of posts and telecommunications",
|
||||
"beijing university of post and telecommunications",
|
||||
"beijing university of post and telecommunication",
|
||||
"beijing university of posts and telecommunication",
|
||||
"bupt",
|
||||
"北京林业大学",
|
||||
"北林",
|
||||
"beijing forestry university",
|
||||
"bfu",
|
||||
"北京协和医学院",
|
||||
"协和医学院",
|
||||
"peking union medical college",
|
||||
"pumc",
|
||||
"北京中医药大学",
|
||||
"北中医",
|
||||
"beijing university of chinese medicine",
|
||||
"bucm",
|
||||
"首都师范大学",
|
||||
"首师大",
|
||||
"capital normal university",
|
||||
"cnu",
|
||||
"北京外国语大学",
|
||||
"北外",
|
||||
"beijing foreign studies university",
|
||||
"bfsu",
|
||||
"中国传媒大学",
|
||||
"中媒",
|
||||
"中传",
|
||||
"北京广播学院",
|
||||
"communication university of china",
|
||||
"cuc",
|
||||
"中央财经大学",
|
||||
"中央财大",
|
||||
"中财大",
|
||||
"the central university of finance and economics",
|
||||
"cufe",
|
||||
"对外经济贸易大学",
|
||||
"对外经贸大学",
|
||||
"贸大",
|
||||
"university of international business and economics",
|
||||
"uibe",
|
||||
"外交学院",
|
||||
"外院",
|
||||
"china foreign affairs university",
|
||||
"cfau",
|
||||
"中国人民公安大学",
|
||||
"公安大学",
|
||||
"people's public security university of china",
|
||||
"ppsuc",
|
||||
"北京体育大学",
|
||||
"北体大",
|
||||
"beijing sport university",
|
||||
"bsu",
|
||||
"中央音乐学院",
|
||||
"央音",
|
||||
"中央院",
|
||||
"central conservatory of music",
|
||||
"ccom",
|
||||
"中国音乐学院",
|
||||
"国音",
|
||||
"中国院",
|
||||
"china conservatory of music",
|
||||
"ccmusic",
|
||||
"中央美术学院",
|
||||
"央美",
|
||||
"central academy of fine art",
|
||||
"cafa",
|
||||
"中央戏剧学院",
|
||||
"中戏",
|
||||
"the central academy of drama",
|
||||
"tcad",
|
||||
"中国政法大学",
|
||||
"法大",
|
||||
"china university of political science and law",
|
||||
"zuc",
|
||||
"cupl",
|
||||
"中国科学院大学",
|
||||
"国科大",
|
||||
"科院大",
|
||||
"university of chinese academy of sciences",
|
||||
"ucas",
|
||||
"福州大学",
|
||||
"福大",
|
||||
"university of fuzhou",
|
||||
"fzu",
|
||||
"暨南大学",
|
||||
"暨大",
|
||||
"ji'nan university",
|
||||
"jnu",
|
||||
"广州中医药大学",
|
||||
"广中医",
|
||||
"traditional chinese medicine university of guangzhou",
|
||||
"gucm",
|
||||
"华南师范大学",
|
||||
"华南师大",
|
||||
"south china normal university",
|
||||
"scnu",
|
||||
"广西大学",
|
||||
"西大",
|
||||
"guangxi university",
|
||||
"gxu",
|
||||
"贵州大学",
|
||||
"贵大",
|
||||
"guizhou university",
|
||||
"gzu",
|
||||
"海南大学",
|
||||
"海大",
|
||||
"university of hainan",
|
||||
"hainu",
|
||||
"河南大学",
|
||||
"河大",
|
||||
"he'nan university",
|
||||
"henu",
|
||||
"哈尔滨工程大学",
|
||||
"哈工程",
|
||||
"harbin engineering university",
|
||||
"heu",
|
||||
"东北农业大学",
|
||||
"东北农大",
|
||||
"northeast agricultural university",
|
||||
"neau",
|
||||
"东北林业大学",
|
||||
"东北林大",
|
||||
"northeast forestry university",
|
||||
"nefu",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"china university of geosciences",
|
||||
"cug",
|
||||
"武汉理工大学",
|
||||
"武汉理工",
|
||||
"wuhan university of technology",
|
||||
"wut",
|
||||
"华中农业大学",
|
||||
"华中农大",
|
||||
"华农",
|
||||
"central china agricultural university",
|
||||
"hzau",
|
||||
"华中师范大学",
|
||||
"华中师大",
|
||||
"华大",
|
||||
"central china normal university",
|
||||
"ccnu",
|
||||
"中南财经政法大学",
|
||||
"中南大",
|
||||
"zhongnan university of economics & law",
|
||||
"zuel",
|
||||
"湖南师范大学",
|
||||
"湖南师大",
|
||||
"hunan normal university",
|
||||
"hunnu",
|
||||
"延边大学",
|
||||
"延大",
|
||||
"yanbian university",
|
||||
"ybu",
|
||||
"东北师范大学",
|
||||
"东北师大",
|
||||
"northeast normal university",
|
||||
"nenu",
|
||||
"苏州大学",
|
||||
"苏大",
|
||||
"soochow university",
|
||||
"suda",
|
||||
"南京航空航天大学",
|
||||
"南航",
|
||||
"nanjing aero-space university",
|
||||
"nuaa",
|
||||
"南京理工大学",
|
||||
"南理工",
|
||||
"institutes of technology of nanjing",
|
||||
"njust",
|
||||
"中国矿业大学",
|
||||
"中国矿大",
|
||||
"china mining university",
|
||||
"cumt",
|
||||
"南京邮电大学",
|
||||
"南邮",
|
||||
"nanjing university of posts and telecommunications",
|
||||
"njupt",
|
||||
"河海大学",
|
||||
"河海",
|
||||
"river sea university",
|
||||
"hhu",
|
||||
"江南大学",
|
||||
"江南大",
|
||||
"jiangnan university",
|
||||
"jiangnan",
|
||||
"南京林业大学",
|
||||
"南林",
|
||||
"nanjing forestry university",
|
||||
"njfu",
|
||||
"南京信息工程大学",
|
||||
"南信大",
|
||||
"nanjing university of information science and technology",
|
||||
"nuist",
|
||||
"南京农业大学",
|
||||
"南农",
|
||||
"南农大",
|
||||
"南京农大",
|
||||
"agricultural university of nanjing",
|
||||
"njau",
|
||||
"nau",
|
||||
"南京中医药大学",
|
||||
"南中医",
|
||||
"nanjing university of chinese medicine",
|
||||
"njucm",
|
||||
"中国药科大学",
|
||||
"中国药大",
|
||||
"china medicine university",
|
||||
"cpu",
|
||||
"南京师范大学",
|
||||
"南京师大",
|
||||
"南师大",
|
||||
"南师",
|
||||
"nanjing normal university",
|
||||
"nnu",
|
||||
"南昌大学",
|
||||
"昌大",
|
||||
"university of nanchang","nanchang university",
|
||||
"ncu",
|
||||
"辽宁大学",
|
||||
"辽大",
|
||||
"liaoning university",
|
||||
"lnu",
|
||||
"大连海事大学",
|
||||
"大连海大",
|
||||
"海大",
|
||||
"maritime affairs university of dalian",
|
||||
"dmu",
|
||||
"内蒙古大学",
|
||||
"内大",
|
||||
"university of the inner mongol","inner mongolia university",
|
||||
"imu",
|
||||
"宁夏大学",
|
||||
"宁大",
|
||||
"ningxia university",
|
||||
"nxu",
|
||||
"青海大学",
|
||||
"清大",
|
||||
"qinghai university",
|
||||
"qhu",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"china university of petroleum beijing",
|
||||
"upc",
|
||||
"太原理工大学",
|
||||
"太原理工",
|
||||
"institutes of technology of taiyuan","taiyuan university of technology",
|
||||
"tyut",
|
||||
"西北大学",
|
||||
"西大",
|
||||
"northwest university",
|
||||
"nwu",
|
||||
"西安电子科技大学",
|
||||
"西电",
|
||||
"xidian university",
|
||||
"xdu",
|
||||
"长安大学",
|
||||
"长大",
|
||||
"chang`an university",
|
||||
"chu",
|
||||
"陕西师范大学",
|
||||
"陕西师大",
|
||||
"陕师大",
|
||||
"shaanxi normal university",
|
||||
"snnu",
|
||||
"第四军医大学",
|
||||
"空军军医大学","四医大",
|
||||
"air force medical university",
|
||||
"fmmu",
|
||||
"华东理工大学",
|
||||
"华理",
|
||||
"east china university of science",
|
||||
"ecust",
|
||||
"东华大学",
|
||||
"东华",
|
||||
"donghua university",
|
||||
"dhu",
|
||||
"上海海洋大学",
|
||||
"上海海大",
|
||||
"shanghai ocean university",
|
||||
"shou",
|
||||
"上海中医药大学",
|
||||
"上中医",
|
||||
"shanghai university of traditional chinese medicine",
|
||||
"shutcm",
|
||||
"上海外国语大学",
|
||||
"上外",
|
||||
"shanghai international studies university",
|
||||
"sisu",
|
||||
"上海财经大学",
|
||||
"上海财大",
|
||||
"上财",
|
||||
"shanghai university of finance",
|
||||
"sufe",
|
||||
"上海体育学院",
|
||||
"shanghai university of sport",
|
||||
"上海音乐学院",
|
||||
"上音",
|
||||
"shanghai conservatory of music",
|
||||
"shcm",
|
||||
"上海大学",
|
||||
"上大",
|
||||
"shanghai university",
|
||||
"第二军医大学",
|
||||
"海军军医大学",
|
||||
"naval medical university",
|
||||
"西南交通大学",
|
||||
"西南交大",
|
||||
"southwest jiaotong university",
|
||||
"swjtu",
|
||||
"西南石油大学",
|
||||
"西南石大",
|
||||
"southwest petroleum university",
|
||||
"swpu",
|
||||
"成都理工大学",
|
||||
"成都理工",
|
||||
"chengdu university of technology",
|
||||
"cdut ",
|
||||
"四川农业大学",
|
||||
"川农",
|
||||
"川农大",
|
||||
"sichuan agricultural university",
|
||||
"sicau",
|
||||
"成都中医药大学",
|
||||
"成中医",
|
||||
"chengdu university of tcm",
|
||||
"cdutcm",
|
||||
"西南财经大学",
|
||||
"西南财大",
|
||||
"西财",
|
||||
"southwestern university of finance and economics",
|
||||
"swufe",
|
||||
"天津工业大学",
|
||||
"天工大",
|
||||
"tianjin university of technology",
|
||||
"tgu",
|
||||
"天津医科大学",
|
||||
"天津医大",
|
||||
"medical university of tianjin",
|
||||
"tmu",
|
||||
"天津中医药大学",
|
||||
"天中",
|
||||
"tianjin university of traditional chinese medicine",
|
||||
"tutcm",
|
||||
"华北电力大学",
|
||||
"华电",
|
||||
"north china electric power university",
|
||||
"ncepu",
|
||||
"河北工业大学",
|
||||
"河工大",
|
||||
"hebei university of technology",
|
||||
"hebut",
|
||||
"西藏大学",
|
||||
"藏大",
|
||||
"tibet university",
|
||||
"tu",
|
||||
"石河子大学",
|
||||
"石大",
|
||||
"shihezi university",
|
||||
"中国美术学院",
|
||||
"中国美院",
|
||||
"国美",
|
||||
"china academy of art",
|
||||
"caa",
|
||||
"宁波大学",
|
||||
"宁大",
|
||||
"ningbo university",
|
||||
"nbu",
|
||||
"西南大学",
|
||||
"西大",
|
||||
"southwest university",
|
||||
"swu",
|
||||
"安徽大学",
|
||||
"安大",
|
||||
"university of anhui",
|
||||
"ahu",
|
||||
"合肥工业大学",
|
||||
"合肥工大",
|
||||
"合工大",
|
||||
"hefei university of technology",
|
||||
"hfut",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"china university of geosciences",
|
||||
"cug",
|
||||
"中国地质大学",
|
||||
"地大",
|
||||
"北京地大",
|
||||
"cugb",
|
||||
"中国矿业大学",
|
||||
"中国矿大",
|
||||
"china university of mining & technology",
|
||||
"cumtb",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"石大",
|
||||
"china university of petroleum",
|
||||
"cup",
|
||||
"中国石油大学",
|
||||
"中石大",
|
||||
"cup"]
|
||||
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
1627
deepdoc/parser/resume/entities/res/school.rank.csv
Normal file
File diff suppressed because it is too large
Load Diff
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
5713
deepdoc/parser/resume/entities/res/schools.csv
Normal file
File diff suppressed because it is too large
Load Diff
91
deepdoc/parser/resume/entities/schools.py
Normal file
91
deepdoc/parser/resume/entities/schools.py
Normal file
@@ -0,0 +1,91 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import copy
|
||||
import pandas as pd
|
||||
|
||||
# Directory containing this module; resource CSV/JSON files live under res/.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# Master school table (tab-separated). Missing cells become "" so string ops are safe.
TBL = pd.read_csv(
    os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
).fillna("")
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())

# Set of "good school" names, normalized by stripping punctuation/whitespace.
# FIX: use `with` so the JSON file handle is closed deterministically
# (the original `json.load(open(...))` leaked the handle until GC).
with open(os.path.join(current_file_path, "res/good_sch.json"), "r", encoding="utf-8") as _f:
    GOOD_SCH = json.load(_f)
GOOD_SCH = {re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH}
|
||||
|
||||
|
||||
def loadRank(fnm):
    """Load school rankings from a CSV file into TBL's "rank" column.

    Each line of *fnm* is "name,rank". Rows whose name matches either the
    Chinese or English school name get that rank; everything else keeps the
    sentinel 1000000 (meaning "unranked"). Malformed lines are skipped.
    """
    global TBL
    TBL["rank"] = 1000000  # sentinel: lower is better, so unranked sorts last
    with open(fnm, "r", encoding="utf-8") as f:
        # FIX: idiomatic line iteration instead of a manual readline()/break loop.
        for raw in f:
            parts = raw.strip("\n").split(",")
            try:
                nm, rk = parts[0].strip(), int(parts[1])
            except (IndexError, ValueError):
                # Narrowed from a blanket `except Exception`: only tolerate
                # short or non-numeric lines, not unexpected table errors.
                continue
            TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk


# Populate ranks at import time from the bundled resource file.
loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
|
||||
|
||||
|
||||
def split(txt):
    """Tokenize *txt* on whitespace, merging runs of Latin-script words.

    Consecutive tokens are joined with a space when the previous token ends
    with an ASCII letter and the next one starts with one, so multi-word
    English school names stay together while CJK tokens remain separate.
    """
    tks = []
    for t in re.sub(r"[ \t]+", " ", txt).split():
        # FIX: the original condition tested `tks` twice ("... and tks");
        # the redundant trailing check is removed.
        if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and re.match(r"[a-zA-Z]", t):
            tks[-1] = tks[-1] + " " + t
        else:
            tks.append(t)
    return tks
|
||||
|
||||
|
||||
def select(nm):
    """Look up one school record by (normalized) name.

    Accepts a string or a list (first element used). The name is lower-cased,
    stripped of parenthesized parts, leading articles/country prefixes and
    punctuation, and "大学...学院" is collapsed to "大学". Returns the first
    matching row of TBL as a dict (matching name_cn, name_en, or a "+"-joined
    alias), or None when there is no match or *nm* is falsy.
    """
    global TBL
    if not nm:
        return
    if isinstance(nm, list):
        nm = str(nm[0])
    nm = split(nm)[0]
    nm = str(nm).lower().strip()
    nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
    nm = re.sub(r"大学.*学院", "大学", nm)
    # FIX: TBL.copy() (deep by default for DataFrames) instead of
    # copy.deepcopy(TBL) — same data-copy semantics, far less overhead.
    tbl = TBL.copy()
    tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
    res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
    if res.empty:
        return

    return json.loads(res.to_json(orient="records"))[0]
|
||||
|
||||
|
||||
def is_good(nm):
    """Return True when the normalized school name *nm* is in GOOD_SCH."""
    cleaned = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    cleaned = re.sub(r"[''`‘’“”,. &()();;]+", "", cleaned)
    return cleaned in GOOD_SCH
|
||||
189
deepdoc/parser/resume/step_one.py
Normal file
189
deepdoc/parser/resume/step_one.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json
|
||||
from deepdoc.parser.resume.entities import degrees, regions, industries
|
||||
|
||||
# Output schema of refactor(): each entry is "<column_name> <type>".
# The column name (first token) becomes a key in the returned dict.
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT",
]
|
||||
|
||||
def refactor(df):
    """Flatten one raw resume row into the flat dict described by FIELDS.

    Expects a DataFrame with a "resume_content" column holding a JSON string.
    Only the FIRST row is returned (see the final `df.values.tolist()[0]`).
    NOTE(review): mutates *df* in place while building the result.
    """
    def deal_obj(obj, k, kk):
        # Safe two-level dict lookup: obj[k][kk], returning "" on any shape mismatch.
        if not isinstance(obj, type({})):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, type({})):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Tolerant JSON parse; malformed content becomes an empty dict.
        try:
            return json.loads(line)
        except Exception:
            pass
        return {}

    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
    df.fillna("", inplace=True)

    # Columns accumulated for the final output; starts with the two always-present ones.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Pull fields `nms` out of the parsed JSON: either from sub-object `cc`
        # (e.g. "contact", "basic"), or — when cc is None — as JSON-serialized
        # top-level values (lists like education/work stay JSON strings).
        nonlocal clms
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(
                    lambda x: json.dumps(x.get(c, {}), ensure_ascii=False)
                    if isinstance(x, type({})) and (isinstance(x.get(c), type({})) or not x.get(c))
                    else str(x).replace("None", ""))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Translate coded values into human-readable names.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join([" ".join(industries.get_names(i)) for i in
                            str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Normalize a value (possibly a list) to a single comma-free string.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    # 'M'/'F' codes → Chinese labels; anything else → "".
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    # 'Y'/'N' flags → 是/否.
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Fall back to the landline ("tel") when the mobile ("phone") is blank.
    for i in range(len(df)):
        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
            df.loc[i, "phone"] = df.loc[i, "tel"].strip()

    # Intermediate-only columns are dropped from the output column list.
    for n in ["industry_ids", "management_experience", "marital", "tel"]:
        for i in range(len(clms)):
            if clms[i] == n:
                del clms[i]
                break

    clms = list(set(clms))

    df = df.reindex(sorted(clms), axis=1)
    # Escape tab/newline characters so values survive tab-separated transport.
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace("\t", " ").replace("\n", "\\n").replace("\r", "\\n"))
    # Zip sorted column values against the FIELDS schema names.
    # NOTE(review): relies on sorted(clms) aligning 1:1 with FIELDS order — verify.
    return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))
|
||||
696
deepdoc/parser/resume/step_two.py
Normal file
696
deepdoc/parser/resume/step_two.py
Normal file
@@ -0,0 +1,696 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import copy
|
||||
import time
|
||||
import datetime
|
||||
import demjson3
|
||||
import traceback
|
||||
import signal
|
||||
import numpy as np
|
||||
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||
from rag.nlp import rag_tokenizer, surname
|
||||
from xpinyin import Pinyin
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
class TimeoutException(Exception):
    """Raised by time_limit() when a guarded block exceeds its time budget."""
|
||||
|
||||
|
||||
@contextmanager
def time_limit(seconds):
    """Context manager raising TimeoutException after *seconds*.

    Uses SIGALRM, so it only works on Unix and only in the main thread;
    the alarm is always cancelled on exit.
    """
    def _on_alarm(signum, frame):
        raise TimeoutException("Timed out!")

    signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
|
||||
|
||||
|
||||
# Unused placeholder in this module — presumably an environment handle; kept for compatibility.
ENV = None
# Shared pinyin converter used for name romanization in parse().
PY = Pinyin()
|
||||
|
||||
|
||||
def rmHtmlTag(line):
    """Replace simple HTML-like tags in *line* with single spaces (case-insensitive)."""
    tag_pattern = r"<[a-z0-9.\"=';,:\+_/ -]+>"
    return re.sub(tag_pattern, " ", line, count=100000, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def highest_degree(dg):
    """Return the highest-ranked degree name in *dg* (str or list); "" when empty.

    Unknown degree names rank below all known ones; ties keep the first occurrence.
    """
    if not dg:
        return ""
    if isinstance(dg, str):
        dg = [dg]
    rank = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    return max(dg, key=lambda name: rank.get(name, -1))
|
||||
|
||||
|
||||
def forEdu(cv):
    """Derive education-related features on the resume dict *cv* and return it.

    Reads cv["education_obj"] (list of dicts); fills keyword/token/ranking
    fields (school names, majors, degrees, school-rank tier, "好学校"/"好学历"
    tags, edu start/end years) and refines cv["work_exp_flt"]. When no
    education info exists, only penalizes cv["integerity_flt"].
    NOTE(review): mutates entries of education_obj (may set n["degree"]).
    """
    if not cv.get("education_obj"):
        cv["integerity_flt"] *= 0.8
        return cv

    first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
    edu_nst = []
    edu_end_dt = ""
    cv["school_rank_int"] = 1000000  # sentinel: lower is a better-ranked school
    for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
        e = {}
        if n.get("end_time"):
            if n["end_time"] > edu_end_dt:
                edu_end_dt = n["end_time"]
            try:
                dt = n["end_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                ed_dt.append(str(y))
                e["end_dt_kwd"] = str(y)
            except Exception:
                # FIX: was `except Exception as e: pass`, which clobbered the
                # per-entry dict `e` and left the name unbound after the
                # except clause (Python 3), crashing later `e[...]` writes.
                pass
        if n.get("start_time"):
            try:
                dt = n["start_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                st_dt.append(str(y))
                e["start_dt_kwd"] = str(y)
            except Exception:
                pass

        r = schools.select(n.get("school_name", ""))
        if r:
            if str(r.get("type", "")) == "1":
                fea.append("211")
            if str(r.get("type", "")) == "2":
                fea.append("211")
            if str(r.get("is_abroad", "")) == "1":
                fea.append("留学")
            if str(r.get("is_double_first", "")) == "1":
                fea.append("双一流")
            if str(r.get("is_985", "")) == "1":
                fea.append("985")
            if str(r.get("is_world_known", "")) == "1":
                fea.append("海外知名")
            if r.get("rank") and cv["school_rank_int"] > r["rank"]:
                cv["school_rank_int"] = r["rank"]

        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
            e["sch_nm_kwd"] = sch[-1]
            # Last token of the tokenized school name (e.g. "大学"/"学院").
            tokens = rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()
            if tokens:  # guard against an empty tokenization (IndexError on [-1])
                fea.append(tokens[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
            e["major_kwd"] = n["discipline_name"]

        # A 985 school with no degree recorded as the first entry implies 本科 ("1").
        if not n.get("degree") and "985" in fea and not first_fea:
            n["degree"] = "1"

        if n.get("degree"):
            d = degrees.get_name(n["degree"])
            if d:
                e["degree_kwd"] = d
            # A bachelor after a junior-college degree (or adult education) is 专升本.
            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name", ""))):
                d = "专升本"
            if d:
                deg.append(d)

            # Record the first (lowest) degree with its school/major/features.
            if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                fdeg = [d]
                if n.get("school_name"):
                    fsch = [n["school_name"]]
                if n.get("discipline_name"):
                    fmaj = [n["discipline_name"]]
                first_fea = copy.deepcopy(fea)

        edu_nst.append(e)

    # Bucket the best school rank into a coarse tier keyword.
    cv["sch_rank_kwd"] = []
    if cv["school_rank_int"] <= 20 \
            or ("海外名校" in fea and cv["school_rank_int"] <= 200):
        cv["sch_rank_kwd"].append("顶尖学校")
    elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
            or ("海外名校" in fea and cv["school_rank_int"] <= 500 and
                cv["school_rank_int"] > 200):
        cv["sch_rank_kwd"].append("精英学校")
    elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
            or ("海外名校" in fea and cv["school_rank_int"] > 500):
        cv["sch_rank_kwd"].append("优质学校")
    else:
        cv["sch_rank_kwd"].append("一般学校")

    if edu_nst:
        cv["edu_nst"] = edu_nst
    if fea:
        cv["edu_fea_kwd"] = list(set(fea))
    if first_fea:
        cv["edu_first_fea_kwd"] = list(set(first_fea))
    if maj:
        cv["major_kwd"] = maj
    if fsch:
        cv["first_school_name_kwd"] = fsch
    if fdeg:
        cv["first_degree_kwd"] = fdeg
    if fmaj:
        cv["first_major_kwd"] = fmaj
    if st_dt:
        cv["edu_start_kwd"] = st_dt
    if ed_dt:
        cv["edu_end_kwd"] = ed_dt
        cv["edu_end_int"] = max([int(t) for t in ed_dt])
    if deg:
        if "本科" in deg and "专科" in deg:
            deg.append("专升本")
            deg = [d for d in deg if d != '本科']
        cv["degree_kwd"] = deg
        cv["highest_degree_kwd"] = highest_degree(deg)
    if edu_end_dt:
        try:
            if re.match(r"[0-9]{9,}", edu_end_dt):
                edu_end_dt = turnTm2Dt(edu_end_dt)
            if edu_end_dt.strip("\n") == "至今":
                edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
            y, m, d = getYMD(edu_end_dt)
            # Work experience can't exceed years since graduation.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
    if sch:
        cv["school_name_kwd"] = sch
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
                or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
                or not cv.get("degree_kwd"):
            for c in sch:
                if schools.is_good(c):
                    if "tag_kwd" not in cv:
                        cv["tag_kwd"] = []
                    cv["tag_kwd"].append("好学校")
                    cv["tag_kwd"].append("好学历")
                    break
    # NOTE(review): "博士" appears twice in the any() list below (原样保留);
    # the second occurrence was possibly intended as "博士后" — confirm.
    if (len(cv.get("degree_kwd", [])) >= 1 and
            "本科" in cv["degree_kwd"] and
            any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
            or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
            or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        if "好学历" not in cv["tag_kwd"]:
            cv["tag_kwd"].append("好学历")

    if cv.get("major_kwd"):
        cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
    if cv.get("school_name_kwd"):
        cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
    if cv.get("first_school_name_kwd"):
        cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
    if cv.get("first_major_kwd"):
        cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))

    return cv
|
||||
|
||||
|
||||
def forProj(cv):
    """Tokenize project names/descriptions from cv["project_obj"] into *_tks/*_ltks fields."""
    if not cv.get("project_obj"):
        return cv

    names, descriptions = [], []
    ordered = sorted(
        cv.get("project_obj", []),
        key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
        reverse=True,
    )
    for item in ordered:
        if item.get("name"):
            names.append(item["name"])
        # "achivement" is the (misspelled) key actually present in the data.
        for field in ("describe", "responsibilities", "achivement"):
            if item.get(field):
                descriptions.append(str(item[field]))

    if names:
        # Only the most recent project name is tokenized.
        cv["project_name_tks"] = rag_tokenizer.tokenize(names[0])
    if descriptions:
        cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(descriptions)))
        cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(descriptions[0]))

    return cv
|
||||
|
||||
|
||||
def json_loads(line):
    """Leniently parse *line* as JSON via demjson3, first quoting bare Python-style True/False values."""
    return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
|
||||
|
||||
|
||||
def forWork(cv):
    """Derive work-history features on the resume dict *cv* and return it.

    Reads cv["work_obj"] (list of dicts, newest job first after sorting);
    fills position/corporation/industry token fields, company-quality tags,
    job-duration statistics, and refines cv["work_exp_flt"]. When no work
    info exists, only penalizes cv["integerity_flt"].
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []      # per-job durations in days
    scales = []    # numeric company sizes
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False   # current (first) employer is a "good" company
    goodcorp_ = False  # some past employer is a "good" company
    work_st_tm = ""    # earliest start_time seen
    corp_tags = []
    for i, n in enumerate(
            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
                   reverse=True)):
        if isinstance(n, str):
            try:
                n = json_loads(n)
            except Exception:
                continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
            work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                fea[c].append("")  # keep positions aligned across fields
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m:
            continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0:
            continue
        if not y or not m or int(y) > 2022:
            y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m:
            continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r:
                scales.append(int(r.group(1)))

    if goodcorp:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司")
    if goodcorp_:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司(曾)")

    if corp_tags:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm:
        cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]:
        cv["corporation_id"] = fea["corporation_id"]

    if fea["position_name"]:
        cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
        cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
        cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
        cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
        cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

    if fea["subordinates_count"]:
        # FIX: the filter regex was r"[^0-9]+$", which kept only all-NON-digit
        # strings and then crashed on int(); digit-only values are what int()
        # can actually convert, so the class is corrected to [0-9].
        fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
                                     re.match(r"[0-9]+$", str(i))]
    if fea["subordinates_count"]:
        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if isinstance(cv.get("corporation_id"), int):
        cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"):
        cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm):
                work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales:
        cv["scale_flt"] = np.max(scales)
    return cv
|
||||
|
||||
|
||||
def turnTm2Dt(b):
    """Normalize *b* to a date string.

    Falsy input returns None. A string of 10+ digits is treated as a Unix
    epoch (seconds, local time) and formatted as "%Y-%m-%d %H:%M:%S";
    anything else is returned stripped.
    """
    if not b:
        return
    text = str(b).strip()
    if re.match(r"[0-9]{10,}", text):
        text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(text[:10])))
    return text
|
||||
|
||||
|
||||
def getYMD(b):
    """Best-effort extraction of (year, month, day) from a date-ish value.

    Returns a tuple where year is an int (or "" when absent) and month/day
    are strings clamped into valid ranges ("1" on anything implausible).
    Epoch-second inputs are first converted via turnTm2Dt().
    """
    y, m, d = "", "", "01"
    if not b:
        return (y, m, d)
    b = turnTm2Dt(b)
    if re.match(r"[0-9]{4}", b):
        y = int(b[:4])
    month_match = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
    if month_match:
        m = month_match.group(1)
    day_match = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
    if day_match:
        d = day_match.group(1)
    if not d or int(d) == 0 or int(d) > 31:
        d = "1"
    if not m or int(m) > 12 or int(m) < 1:
        m = "1"
    return (y, m, d)
|
||||
|
||||
|
||||
def birth(cv):
    """Fill birth_dt / birthday_kwd / age_int on *cv* from cv["birth"].

    A missing birth field penalizes cv["integerity_flt"]; an unparseable
    one leaves cv unchanged.
    """
    if not cv.get("birth"):
        cv["integerity_flt"] *= 0.9
        return cv
    y, m, d = getYMD(cv["birth"])
    if not m or not y:
        return cv
    cv["birth_dt"] = "%s-%02d-%02d" % (y, int(m), int(d))
    cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))

    cv["age_int"] = datetime.datetime.now().year - int(y)
    return cv
|
||||
|
||||
|
||||
def parse(cv):
    """Transform a raw flat resume dict into an indexable document.

    Normalizes values, computes a completeness score (integerity_flt),
    expands *_obj JSON blobs, tokenizes text fields into *_tks/*_ltks,
    builds *_kwd keyword lists, derives education/project/work/birth
    features, and finally drops every key that does not end in one of the
    recognized suffixes. Returns the cleaned dict (numpy ints converted).
    """
    # "\N" is the SQL NULL escape in exported data — blank it out.
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    # cv = cv.asDict()
    # Field groups driving the per-key processing loop below.
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # (field, label-when-是, label-when-否) triples for yes/no flags.
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Drop None values and empty strings/lists up front.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    # Completeness score: fraction of the grouped fields that carry a value.
    integerity = 0.
    flds_num = 0.

    def hasValues(flds):
        nonlocal integerity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integerity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    cv["integerity_flt"] = integerity / flds_num

    # Collapse free-text company types into a small label set; drop when too short.
    if cv.get("corporation_type"):
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    # Normalize political status to 党员/群众/团员; drop anything unrecognized.
    if cv.get("political_status"):
        for p, r in [
                (r".*党员.*", "党员"),
                (r".*(无党派|公民).*", "群众"),
                (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    # Strip non-digits and a leading +86 country code from the phone number.
    if cv.get("phone"):
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    keys = list(cv.keys())
    for k in keys:
        # deal with json objects: "*_obj" keys hold nested JSON blobs.
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    t = k[:-4]  # strip the "_obj" suffix
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
        if k in small_tks_fld:
            # NOTE(review): other *_sm_tks fields use fine_grained_tokenize;
            # this one re-applies tokenize — confirm intent.
            cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

        # keyword fields: split on punctuation, inserting commas at
        # non-letter/space boundaries first.
        if k in kwd_fld:
            cv[f"{k}_kwd"] = [n.lower()
                              for n in re.split(r"[\t,,;;. ]",
                                                re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                ) if n]

        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        # Cut everything after a dash/paren/plus, then collapse whitespace.
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # Latin name: only keep it when it has at least two words.
            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            # Chinese name starting with a known surname: keep up to 5 chars.
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pingyin and its prefix (for pinyin prefix search).
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split()
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        # Non-Latin names also index their individual characters.
        cv["name_tks"] = (
            rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        cv["integerity_flt"] /= 2.

    # Keep only a valid 11-digit mainland mobile number.
    if cv.get("phone"):
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y:
            y = "2012"  # fallback epoch for records with no usable timestamp
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
    # long text tokenize

    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field: turn 是/否 flags into tag keywords.
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Combine corp tags with school-rank tier and highest degree into one
    # facet keyword ("corp+tier+degree").
    cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    # Derive work experience from work_start_time (epoch ms or date string).
    try:
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Final cleanup: keep only keys with a recognized type suffix.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # Deduplicate keyword lists, dropping noise values and a trailing "市".
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    # Drop non-positive *_fea values.
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]
    logging.debug("CCCCCCCCCCCCCCC")

    return dealWithInt64(cv)
|
||||
|
||||
|
||||
def dealWithInt64(d):
    """Recursively convert numpy integer scalars into plain Python ints.

    Walks dicts (mutated in place) and lists (rebuilt); any ``np.integer``
    leaf becomes a builtin ``int`` so the structure can be serialized.
    All other values are returned unchanged.
    """
    if isinstance(d, dict):
        for key in d:
            d[key] = dealWithInt64(d[key])
        return d
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, np.integer):
        return int(d)
    return d
|
||||
64
deepdoc/parser/txt_parser.py
Normal file
64
deepdoc/parser/txt_parser.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.nlp import num_tokens_from_string
|
||||
|
||||
|
||||
class RAGFlowTxtParser:
    """Parse a plain-text document into token-bounded chunks."""

    def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
        """Read the file at *fnm* (or the raw *binary* bytes) and chunk it.

        Returns a list of ``[chunk_text, ""]`` pairs (see ``parser_txt``).
        """
        txt = get_text(fnm, binary)
        return self.parser_txt(txt, chunk_token_num, delimiter)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
        """Split *txt* on *delimiter* characters into ~*chunk_token_num*-token chunks.

        *delimiter* is a string of single-character separators; a run wrapped
        in backticks (e.g. "`\\n\\n`") is treated as one multi-character
        separator.  Literal escape sequences such as the two characters
        "\\n" are interpreted as the character they denote.

        Returns a list of ``[chunk_text, ""]`` pairs; delimiter occurrences
        themselves are dropped from the output.
        """
        if not isinstance(txt, str):
            raise TypeError("txt type should be str!")
        cks = [""]
        tk_nums = [0]
        # Interpret literal escape sequences (e.g. backslash-n) without
        # corrupting non-ASCII characters in the delimiter string.
        delimiter = delimiter.encode('utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')

        def add_chunk(t):
            # Open a new chunk once the current one is already over budget;
            # otherwise keep appending to it.  (The check happens before the
            # append, so a chunk may exceed chunk_token_num by one section —
            # preserved from the original design.)
            nonlocal cks, tk_nums
            tnum = num_tokens_from_string(t)
            if tk_nums[-1] > chunk_token_num:
                cks.append(t)
                tk_nums.append(tnum)
            else:
                cks[-1] += t
                tk_nums[-1] += tnum

        # Collect separators: backtick-quoted spans are multi-character
        # separators; everything outside backticks splits into single chars.
        dels = []
        s = 0
        for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
            f, t = m.span()
            dels.append(m.group(1))
            dels.extend(list(delimiter[s: f]))
            s = t
        if s < len(delimiter):
            dels.extend(list(delimiter[s:]))
        dels = [re.escape(d) for d in dels if d]
        dels = [d for d in dels if d]
        if not dels:
            # No usable separator: an empty alternation would make re.split
            # cut between every character, so keep the text as one section.
            add_chunk(txt)
            return [[c, ""] for c in cks]
        dels = "|".join(dels)
        secs = re.split(r"(%s)" % dels, txt)
        for sec in secs:
            # The alternation must be parenthesized: in "^a|b$" the anchors
            # bind only to the first and last branch, so an unanchored middle
            # branch could (in principle) match a non-delimiter section.
            if re.match(f"^({dels})$", sec):
                continue
            add_chunk(sec)

        return [[c, ""] for c in cks]
|
||||
32
deepdoc/parser/utils.py
Normal file
32
deepdoc/parser/utils.py
Normal file
@@ -0,0 +1,32 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
def get_text(fnm: str, binary=None) -> str:
    """Return the text content of a document.

    If *binary* (raw bytes) is truthy, detect its encoding with
    ``find_codec`` and decode it, ignoring undecodable bytes.  Otherwise
    read the whole file at *fnm* (platform-default encoding, unchanged
    from the original behavior).
    """
    if binary:
        encoding = find_codec(binary)
        return binary.decode(encoding, errors="ignore")
    # Single read() replaces the original readline()/+= accumulation loop,
    # which was verbose and quadratic in the worst case.
    with open(fnm, "r") as f:
        return f.read()
|
||||
Reference in New Issue
Block a user