将flask改成fastapi

This commit is contained in:
2025-10-13 13:18:03 +08:00
commit 88db2539b0
476 changed files with 739741 additions and 0 deletions

15
rag/app/__init__.py Normal file
View File

@@ -0,0 +1,15 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

61
rag/app/audio.py Normal file
View File

@@ -0,0 +1,61 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
import tempfile
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from rag.nlp import rag_tokenizer, tokenize
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # is_english(sections)
try:
_, ext = os.path.splitext(filename)
if not ext:
raise RuntimeError("No extension detected.")
if ext not in [".da", ".wave", ".wav", ".mp3", ".wav", ".aac", ".flac", ".ogg", ".aiff", ".au", ".midi", ".wma", ".realaudio", ".vqf", ".oggvorbis", ".aac", ".ape"]:
raise RuntimeError(f"Extension {ext} is not supported yet.")
tmp_path = ""
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
tmpf.write(binary)
tmpf.flush()
tmp_path = os.path.abspath(tmpf.name)
callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
ans = seq2txt_mdl.transcription(tmp_path)
callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
tokenize(doc, ans, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
finally:
if tmp_path and os.path.exists(tmp_path):
try:
os.unlink(tmp_path)
except Exception:
pass
return []

160
rag/app/book.py Normal file
View File

@@ -0,0 +1,160 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._filter_forpages()
self._merge_with_same_bullet()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please setup the page ranges for every book in order eliminate negative effects and save elapsed computing time.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections, tbls = [], []
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
doc_parser = DocxParser()
# TODO: table of contents need to be removed
sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
make_colon_as_title(sections)
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 5)]
else:
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]
chunks = naive_merge(
sections, kwargs.get(
"chunk_token_num", 256), kwargs.get(
"delimer", "\n。;!?"))
# is it English
# is_english(random_choices([t for t, _ in sections], k=218))
eng = lang.lower() == "english"
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)

117
rag/app/email.py Normal file
View File

@@ -0,0 +1,117 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from email import policy
from email.parser import BytesParser
from rag.app.naive import chunk as naive_chunk
import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
import io
def chunk(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
**kwargs,
):
"""
Only eml is supported
"""
eng = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config",
{"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
)
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
main_res = []
attachment_res = []
if binary:
msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
else:
msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
text_txt, html_txt = [], []
# get the email header info
for header, value in msg.items():
text_txt.append(f"{header}: {value}")
# get the email main info
def _add_content(msg, content_type):
if content_type == "text/plain":
text_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif content_type == "text/html":
html_txt.append(
msg.get_payload(decode=True).decode(msg.get_content_charset())
)
elif "multipart" in content_type:
if msg.is_multipart():
for part in msg.iter_parts():
_add_content(part, part.get_content_type())
_add_content(msg, msg.get_content_type())
sections = TxtParser.parser_txt("\n".join(text_txt)) + [
(line, "") for line in HtmlParser.parser_txt("\n".join(html_txt), chunk_token_num=parser_config["chunk_token_num"]) if line
]
st = timer()
chunks = naive_merge(
sections,
int(parser_config.get("chunk_token_num", 128)),
parser_config.get("delimiter", "\n!?。;!?"),
)
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
logging.debug("naive_merge({}): {}".format(filename, timer() - st))
# get the attachment info
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
try:
attachment_res.extend(
naive_chunk(filename, payload, callback=callback, **kwargs)
)
except Exception:
pass
return main_res + attachment_res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

213
rag/app/laws.py Normal file
View File

@@ -0,0 +1,213 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
import re
from io import BytesIO
from docx import Document
from api.db import ParserType
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, \
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
class Docx(DocxParser):
def __init__(self):
pass
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def old_call(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [line for line in lines if line]
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
level_set = set()
bull = bullets_category([p.text for p in self.doc.paragraphs])
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = docx_question_level(p, bull)
if not p_text.strip("\n"):
continue
lines.append((question_level, p_text))
level_set.add(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
sorted_levels = sorted(level_set)
h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1
h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level
root = Node(level=0, depth=h2_level, texts=[])
root.build_tree(lines)
return [("\n").join(element) for element in root.get_tree() if element]
def __str__(self) -> str:
return f'''
question:{self.question},
answer:{self.answer},
level:{self.level},
childs:{self.childs}
'''
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.LAWS.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts:".format(
))
self._naive_vertical_merge()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections = []
# is it English
eng = lang.lower() == "english" # is_english(sections)
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunks = Docx()(filename, binary)
callback(0.7, "Finish parsing.")
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)
elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
# Remove 'Contents' part
remove_contents_table(sections, eng)
make_colon_as_title(sections)
bull = bullets_category(sections)
res = tree_merge(bull, sections, 2)
if not res:
callback(0.99, "No chunk parsed out.")
return tokenize_chunks(res, doc, eng, pdf_parser)
# chunks = hierarchical_merge(bull, sections, 5)
# return tokenize_chunks(["\n".join(ck)for ck in chunks], doc, eng, pdf_parser)
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

285
rag/app/manual.py Normal file
View File

@@ -0,0 +1,285 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
from api.db import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from docx import Document
from PIL import Image
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.MANUAL.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR: {}".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts: {}".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
self._filter_forpages()
callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
# clean mess
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob))
return image
def concat_img(self, img1, img2):
if img1 and not img2:
return img1
if not img1 and img2:
return img2
if not img1 and not img2:
return None
width1, height1 = img1.size
width2, height2 = img2.size
new_width = max(width1, width2)
new_height = height1 + height2
new_image = Image.new('RGB', (new_width, new_height))
new_image.paste(img1, (0, 0))
new_image.paste(img2, (0, height1))
return new_image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
ti_list = []
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
current_image = self.get_picture(self.doc, p)
last_image = self.concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
last_answer, last_image = '', None
i = question_level
while question_stack and i <= level_stack[-1]:
question_stack.pop()
level_stack.pop()
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i+1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
else:
break
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return ti_list, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
pdf_parser = None
doc = {
"docnm_kwd": filename
}
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3:
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1)
levels = []
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
else:
levels.append(max_lvl + 1)
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, lvl) for txt, lvl, _ in sections])
assert len(sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
def tag(pn, left, right, top, bottom):
if pn + left + right + top + bottom == 0:
return ""
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
if chunks:
chunks[-1] += "\n" + txt + poss
tk_cnt += num_tokens_from_string(txt)
continue
chunks.append(txt + poss)
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)
if image:
d['image'] = image
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

603
rag/app/naive.py Normal file
View File

@@ -0,0 +1,603 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer
from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from tika import parser
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
imgs = paragraph._element.xpath('.//pic:pic')
if not imgs:
return None
res_img = None
for img in imgs:
embed = img.xpath('.//a:blip/@r:embed')
if not embed:
continue
embed = embed[0]
try:
related_part = document.part.related_parts[embed]
image_blob = related_part.image.blob
except UnrecognizedImageError:
logging.info("Unrecognized image format. Skipping image.")
continue
except UnexpectedEndOfFileError:
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
continue
except InvalidImageStreamError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
continue
except UnicodeDecodeError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
continue
except Exception:
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
continue
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
if res_img is None:
res_img = image
else:
res_img = concat_img(res_img, image)
except Exception:
continue
return res_img
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def __get_nearest_title(self, table_index, filename):
"""Get the hierarchical title structure before the table"""
import re
from docx.text.paragraph import Paragraph
titles = []
blocks = []
# Get document name from filename parameter
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
if not doc_name:
doc_name = "Untitled Document"
# Collect all document blocks while maintaining document order
try:
# Iterate through all paragraphs and tables in document order
for i, block in enumerate(self.doc._element.body):
if block.tag.endswith('p'): # Paragraph
p = Paragraph(block, self.doc)
blocks.append(('p', i, p))
elif block.tag.endswith('tbl'): # Table
blocks.append(('t', i, None)) # Table object will be retrieved later
except Exception as e:
logging.error(f"Error collecting blocks: {e}")
return ""
# Find the target table position
target_table_pos = -1
table_count = 0
for i, (block_type, pos, _) in enumerate(blocks):
if block_type == 't':
if table_count == table_index:
target_table_pos = pos
break
table_count += 1
if target_table_pos == -1:
return "" # Target table not found
# Find the nearest heading paragraph in reverse order
nearest_title = None
for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
continue
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try:
level_match = re.search(r"(\d+)", block.style.name)
if level_match:
level = int(level_match.group(1))
if level <= 7: # Support up to 7 heading levels
title_text = block.text.strip()
if title_text: # Avoid empty titles
nearest_title = (level, title_text)
break
except Exception as e:
logging.error(f"Error parsing heading level: {e}")
if nearest_title:
# Add current title
titles.append(nearest_title)
current_level = nearest_title[0]
# Find all parent headings, allowing cross-level search
while current_level > 1:
found = False
for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
continue
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try:
level_match = re.search(r"(\d+)", block.style.name)
if level_match:
level = int(level_match.group(1))
# Find any heading with a higher level
if level < current_level:
title_text = block.text.strip()
if title_text: # Avoid empty titles
titles.append((level, title_text))
current_level = level
found = True
break
except Exception as e:
logging.error(f"Error parsing parent heading: {e}")
if not found: # Break if no parent heading is found
break
# Sort by level (ascending, from highest to lowest)
titles.sort(key=lambda x: x[0])
# Organize titles (from highest to lowest)
hierarchy = [doc_name] + [t[1] for t in titles]
return " > ".join(hierarchy)
return ""
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page:
if p.text.strip():
if p.style and p.style.name == 'Caption':
former_image = None
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
former_image = lines[-1][1].pop()
elif last_image:
former_image = last_image
last_image = None
lines.append((self.__clean(p.text), [former_image], p.style.name))
else:
current_image = self.get_picture(self.doc, p)
image_list = [current_image]
if last_image:
image_list.insert(0, last_image)
last_image = None
lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
else:
if current_image := self.get_picture(self.doc, p):
if lines:
lines[-1][1].append(current_image)
else:
last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
tbls = []
for i, tb in enumerate(self.doc.tables):
title = self.__get_nearest_title(i, filename)
html = "<table>"
if title:
html += f"<caption>Table Location: {title}</caption>"
for r in tb.rows:
html += "<tr>"
i = 0
try:
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
else:
break
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
except Exception as e:
logging.warning(f"Error parsing table, ignore: {e}")
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return new_line, tbls
class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
start = timer()
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
if separate_tables_figures:
tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
self._concat_downward()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
else:
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._concat_downward()
# self._filter_forpages()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
class Markdown(MarkdownParser):
def get_picture_urls(self, sections):
if not sections:
return []
if isinstance(sections, type("")):
text = sections
elif isinstance(sections[0], type("")):
text = sections[0]
else:
return []
from bs4 import BeautifulSoup
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
return html_images
def get_pictures(self, text):
"""Download and open all images from markdown text."""
import requests
image_urls = self.get_picture_urls(text)
images = []
# Find all image URLs in text
for url in image_urls:
try:
# check if the url is a local file or a remote URL
if url.startswith(('http://', 'https://')):
# For remote URLs, download the image
response = requests.get(url, stream=True, timeout=30)
if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
img = Image.open(BytesIO(response.content)).convert('RGB')
images.append(img)
else:
# For local file paths, open the image directly
from pathlib import Path
local_path = Path(url)
if not local_path.exists():
logging.warning(f"Local image file not found: {url}")
continue
img = Image.open(url).convert('RGB')
images.append(img)
except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}")
continue
return images if images else None
def __call__(self, filename, binary=None, separate_tables=True):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(filename, "r") as f:
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements()
sections = [(element, "") for element in element_sections]
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
relationships contained in *rels_item_xml*. Returns an empty
collection if *rels_item_xml* is |None|.
"""
srels = _SerializedRelationships()
if rels_item_xml is not None:
rels_elm = parse_xml(rels_item_xml)
for rel_elm in rels_elm.Relationship_lst:
if rel_elm.target_ref in ('../NULL', 'NULL'):
continue
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
return srels
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
section_images = None
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary)
if vision_model:
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
st = timer()
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
callback(0.1, "Start to parse.")
if layout_recognizer == "DeepDOC":
pdf_parser = Pdf()
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
try:
pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
boosted_figures = pdf_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables.extend(figures)
else:
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
else:
if layout_recognizer == "Plain Text":
pdf_parser = PlainParser()
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
parser_config["chunk_token_num"] = 12800
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary, separate_tables=False)
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
# Process images for each section
section_images = []
for idx, (section_text, _) in enumerate(sections):
images = markdown_parser.get_pictures(section_text) if section_text else None
if images:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
section_images.append(combined_image)
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
else:
section_images.append(None)
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = HtmlParser()(filename, binary, chunk_token_num)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = JsonParser(chunk_token_num)(binary)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
else:
callback(0.8, f"tika.parser got empty content from {filename}.")
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
if section_images:
# if all images are None, set section_images to None
if all(image is None for image in section_images):
section_images = None
if section_images:
chunks, images = naive_merge_with_images(sections, section_images,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

141
rag/app/one.py Normal file
View File

@@ -0,0 +1,141 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tika import parser
from io import BytesIO
import re
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug("layouts cost: {}s".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
One file forms a chunk which maintains original text order.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
eng = lang.lower() == "english" # is_english(cks)
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, _ = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback)
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
sections = excel_parser.html(binary, 1000000000)
elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
tokenize(doc, "\n".join(sections), eng)
return [doc]
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

297
rag/app/paper.py Normal file
View File

@@ -0,0 +1,297 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import copy
import re
from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.PAPER.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
logging.debug(f"layouts cost: {timer() - start}s")
start = timer()
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
tbls = self._extract_table_figure(True, zoomin, True, True)
column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
self._concat_downward()
self._filter_forpages()
callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
logging.debug("two_column................... {} {}".format(column_width,
self.page_images[0].size[0] / zoomin / 2))
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
def _begin(txt):
return re.match(
"[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
txt.lower().strip())
if from_page > 0:
return {
"title": "",
"authors": "",
"abstract": "",
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
# get title and authors
title = ""
authors = []
i = 0
while i < min(32, len(self.boxes)-1):
b = self.boxes[i]
i += 1
if b.get("layoutno", "").find("title") >= 0:
title = b["text"]
if _begin(title):
title = ""
break
for j in range(3):
if _begin(self.boxes[i + j]["text"]):
break
authors.append(self.boxes[i + j]["text"])
break
break
# get abstract
abstr = ""
i = 0
while i + 1 < min(32, len(self.boxes)):
b = self.boxes[i]
i += 1
txt = b["text"].lower().strip()
if re.match("(abstract|摘要)", txt):
if len(txt.split()) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(b, zoomin)
break
txt = self.boxes[i]["text"].lower().strip()
if len(txt.split()) > 32 or len(txt) > 64:
abstr = txt + self._line_tag(self.boxes[i], zoomin)
i += 1
break
if not abstr:
i = 0
callback(
0.8, "Page {}~{}: Text merging finished".format(
from_page, min(
to_page, self.total_page)))
for b in self.boxes:
logging.debug("{} {}".format(b["text"], b.get("layoutno")))
logging.debug("{}".format(tbls))
return {
"title": title,
"authors": " ".join(authors),
"abstract": abstr,
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE):
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
paper = {
"title": filename,
"authors": " ",
"abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
"tables": []
}
else:
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
logging.debug("It's English.....{}".format(eng))
res = tokenize_table(paper["tables"], doc, eng)
if paper["abstract"]:
d = copy.deepcopy(doc)
txt = pdf_parser.remove_tag(paper["abstract"])
d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
d["important_tks"] = " ".join(d["important_kwd"])
d["image"], poss = pdf_parser.crop(
paper["abstract"], need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
sorted_sections = paper["sections"]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
bull = bullets_category([txt for txt, _ in sorted_sections])
most_level, levels = title_frequency(bull, sorted_sections)
assert len(sorted_sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
chunks = []
last_sid = -2
for (txt, _), sec_id in zip(sorted_sections, sec_ids):
if sec_id == last_sid:
if chunks:
chunks[-1] += "\n" + txt
continue
chunks.append(txt)
last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
"""
readed = [0] * len(paper["lines"])
# find colon firstly
i = 0
while i + 1 < len(paper["lines"]):
txt = pdf_parser.remove_tag(paper["lines"][i][0])
j = i
if txt.strip("\n").strip()[-1] not in ":":
i += 1
continue
i += 1
while i < len(paper["lines"]) and not paper["lines"][i][0]:
i += 1
if i >= len(paper["lines"]): break
proj = [paper["lines"][i][0].strip()]
i += 1
while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
proj.append(paper["lines"][i])
i += 1
for k in range(j, i): readed[k] = True
txt = txt[::-1]
if eng:
r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
else:
r = re.search(r"(.*?) ([。?;!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
for p in proj:
d = copy.deepcopy(doc)
txt += "\n" + pdf_parser.remove_tag(p)
d["image"], poss = pdf_parser.crop(p, need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
i = 0
chunk = []
tk_cnt = 0
def add_chunk():
nonlocal chunk, res, doc, pdf_parser, tk_cnt
d = copy.deepcopy(doc)
ck = "\n".join(chunk)
tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
res.append(d)
chunk = []
tk_cnt = 0
while i < len(paper["lines"]):
if tk_cnt > 128:
add_chunk()
if readed[i]:
i += 1
continue
readed[i] = True
txt, layouts = paper["lines"][i]
txt_ = pdf_parser.remove_tag(txt)
i += 1
cnt = num_tokens_from_string(txt_)
if any([
layouts.find("title") >= 0 and chunk,
cnt + tk_cnt > 128 and tk_cnt > 32,
]):
add_chunk()
chunk = [txt]
tk_cnt = cnt
else:
chunk.append(txt)
tk_cnt += cnt
if chunk: add_chunk()
for i, d in enumerate(res):
print(d)
# d["image"].save(f"./logs/{i}.jpg")
return res
"""
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

91
rag/app/picture.py Normal file
View File

@@ -0,0 +1,91 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import re
import numpy as np
from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR
from rag.nlp import tokenize
from rag.utils import clean_markdown_block
from rag.nlp import rag_tokenizer
ocr = OCR()
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
img = Image.open(io.BytesIO(binary)).convert('RGB')
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
"image": img,
"doc_type_kwd": "image"
}
bxs = ocr(np.array(img))
txt = "\n".join([t[0] for _, t in bxs if t[0]])
eng = lang.lower() == "english"
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
try:
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format='JPEG')
img_binary.seek(0)
ans = cv_mdl.describe(img_binary.read())
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
except Exception as e:
callback(prog=-1, msg=str(e))
return []
def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
"""
A simple wrapper to process image to markdown texts via VLM.
Returns:
Simple markdown texts generated by VLM.
"""
callback = callback or (lambda prog, msg: None)
img = binary
txt = ""
try:
with io.BytesIO() as img_binary:
img.save(img_binary, format='JPEG')
img_binary.seek(0)
ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
txt += "\n" + ans
return txt
except Exception as e:
callback(-1, str(e))
return ""

168
rag/app/presentation.py Normal file
View File

@@ -0,0 +1,168 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO
from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser.pdf_parser import VisionParser
from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read
class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
txts = super().__call__(fnm, from_page, to_page)
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
try:
with BytesIO() as buffered:
slide.get_thumbnail(
0.1, 0.1).save(
buffered, drawing.imaging.ImageFormat.jpeg)
buffered.seek(0)
imgs.append(Image.open(buffered).copy())
except RuntimeError as e:
raise RuntimeError(f'ppt parse error at page {i+1}, original error: {str(e)}') from e
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __garbage(self, txt):
txt = txt.lower().strip()
if re.match(r"[0-9\.,%/-]+$", txt):
return True
if len(txt) < 3:
return True
return False
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(filename if not binary else binary,
zoomin, from_page, to_page, callback)
callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
len(self.boxes), len(self.page_images))
res = []
for i in range(len(self.boxes)):
lines = "\n".join([b["text"] for b in self.boxes[i]
if not self.__garbage(b["text"])])
res.append((lines, self.page_images[i]))
callback(0.9, "Page {}~{}: Parsing finished".format(
from_page, min(to_page, self.total_page)))
return res
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
page_txt.append(page.extract_text())
callback(0.9, "Parsing finished")
return [(txt, None) for txt in page_txt]
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, parser_config=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
"""
if parser_config is None:
parser_config = {}
eng = lang.lower() == "english"
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["doc_type_kwd"] = "image"
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if layout_recognizer == "DeepDOC":
pdf_parser = Pdf()
sections = pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
elif layout_recognizer == "Plain Text":
pdf_parser = PlainParser()
sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
callback(0.8, "Finish parsing.")
for pn, (txt, img) in enumerate(sections):
d = copy.deepcopy(doc)
pn += from_page
if img:
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
raise NotImplementedError(
"file type not supported yet(pptx, pdf supported)")
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)

471
rag/app/qa.py Normal file
View File

@@ -0,0 +1,471 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import csv
from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer
from openpyxl import load_workbook
from deepdoc.parser.utils import get_text
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
from markdown import markdown
from rag.utils import get_float
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
wb = load_workbook(fnm)
else:
wb = load_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows))
res, fails = [], []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
for i, r in enumerate(rows):
q, a = "", ""
for cell in r:
if not cell.value:
continue
if not q:
q = str(cell.value)
elif not a:
a = str(cell.value)
else:
break
if q and a:
res.append((q, a))
else:
fails.append(str(i + 1))
if len(res) % 999 == 0:
callback(len(res) *
0.6 /
total, ("Extract pairs: {}".format(len(res)) +
(f"{len(fails)} failure, line: %s..." %
(",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
self.is_english = is_english(
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
return res
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge()
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
tbls = self._extract_table_figure(True, zoomin, True, True)
#self._naive_vertical_merge()
# self._concat_downward()
#self._filter_forpages()
logging.debug("layouts: {}".format(timer() - start))
sections = [b["text"] for b in self.boxes]
bull_x0_list = []
q_bull, reg = qbullets_category(sections)
if q_bull == -1:
raise ValueError("Unable to recognize Q&A structure.")
qai_list = []
last_q, last_a, last_tag = '', '', ''
last_index = -1
last_box = {'text':''}
last_bull = None
def sort_key(element):
tbls_pn = element[1][0][0]
tbls_top = element[1][0][3]
return tbls_pn, tbls_top
tbls.sort(key=sort_key)
tbl_index = 0
last_pn, last_bottom = 0, 0
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
for box in self.boxes:
section, line_tag = box['text'], self._line_tag(box, zoomin)
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
last_box, last_index, last_bull = box, index, has_bull
line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])
line_top = get_float(line_tag.rstrip('##').split('\t')[3])
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
if not has_bull: # No question bullet
if not last_q:
if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed
tbl_index += 1
continue
else:
sum_tag = line_tag
sum_section = section
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer
sum_tag = f'{tbl_tag}{sum_tag}'
sum_section = f'{tbl_text}{sum_section}'
tbl_index += 1
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
last_a = f'{last_a}{sum_section}'
last_tag = f'{last_tag}{sum_tag}'
else:
if last_q:
while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \
and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer
last_tag = f'{last_tag}{tbl_tag}'
last_a = f'{last_a}{tbl_text}'
tbl_index += 1
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
image, poss = self.crop(last_tag, need_position=True)
qai_list.append((last_q, last_a, image, poss))
last_q, last_a, last_tag = '', '', ''
last_q = has_bull.group()
_, end = has_bull.span()
last_a = section[end:]
last_tag = line_tag
last_bottom = float(line_tag.rstrip('##').split('\t')[4])
last_pn = line_pn
if last_q:
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
return qai_list, tbls
def get_tbls_info(self, tbls, tbl_index):
if tbl_index >= len(tbls):
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
tbl_pn = tbls[tbl_index][1][0][0]+1
tbl_left = tbls[tbl_index][1][0][1]
tbl_right = tbls[tbl_index][1][0][2]
tbl_top = tbls[tbl_index][1][0][3]
tbl_bottom = tbls[tbl_index][1][0][4]
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
_tbl_text = ''.join(tbls[tbl_index][0][1])
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, _tbl_text
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob)).convert('RGB')
return image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
qai_list = []
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
current_image = self.get_picture(self.doc, p)
last_image = concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
if sum_question:
qai_list.append((sum_question, last_answer, last_image))
last_answer, last_image = '', None
i = question_level
while question_stack and i <= level_stack[-1]:
question_stack.pop()
level_stack.pop()
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
if sum_question:
qai_list.append((sum_question, last_answer, last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
for r in tb.rows:
html += "<tr>"
i = 0
while i < len(r.cells):
span = 1
c = r.cells[i]
for j in range(i+1, len(r.cells)):
if c.text == r.cells[j].text:
span += 1
i = j
i += 1
html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return qai_list, tbls
def rmPrefix(txt):
return re.sub(
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
def beAdocPdf(d, q, a, eng, image, poss):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
if image:
d["image"] = image
d["doc_type_kwd"] = "image"
add_positions(d, poss)
return d
def beAdocDocx(d, q, a, eng, image, row_num=-1):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
if image:
d["image"] = image
d["doc_type_kwd"] = "image"
if row_num >= 0:
d["top_int"] = [row_num]
return d
def beAdoc(d, q, a, eng, row_num=-1):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
if row_num >= 0:
d["top_int"] = [row_num]
return d
def mdQuestionLevel(s):
match = re.match(r'#*', s)
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in excel format, there should be 2 column question and answer without header.
And question column is ahead of answer column.
And it's O.K if it has multiple sheets as long as the columns are rightly composed.
If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
All the deformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
"""
eng = lang.lower() == "english"
res = []
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
return res
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
comma, tab = 0, 0
for line in lines:
if len(line.split(",")) == 2:
comma += 1
if len(line.split("\t")) == 2:
tab += 1
delimiter = "\t" if tab >= comma else ","
fails = []
question, answer = "", ""
i = 0
while i < len(lines):
arr = lines[i].split(delimiter)
if len(arr) != 2:
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i+1))
elif len(arr) == 2:
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
question, answer = arr
i += 1
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
delimiter = "\t" if any("\t" in line for line in lines) else ","
fails = []
question, answer = "", ""
res = []
reader = csv.reader(lines, delimiter=delimiter)
for i, row in enumerate(reader):
if len(row) != 2:
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i + 1))
elif len(row) == 2:
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
question, answer = row
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(list(reader))))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
pdf_parser = Pdf()
qai_list, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
for q, a, image, poss in qai_list:
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
return res
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
_last_question, last_answer = "", ""
question_stack, level_stack = [], []
code_block = False
for index, line in enumerate(lines):
if line.strip().startswith('```'):
code_block = not code_block
question_level, question = 0, ''
if not code_block:
question_level, question = mdQuestionLevel(line)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{line}'
else: # is a question
if last_answer.strip():
sum_question = '\n'.join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
last_answer = ''
i = question_level
while question_stack and i <= level_stack[-1]:
question_stack.pop()
level_stack.pop()
question_stack.append(question)
level_stack.append(question_level)
if last_answer.strip():
sum_question = '\n'.join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
return res
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()
qai_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for i, (q, a, image) in enumerate(qai_list):
res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
return res
raise NotImplementedError(
"Excel, csv(txt), pdf, markdown and docx format files are supported.")
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

176
rag/app/resume.py Normal file
View File

@@ -0,0 +1,176 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import base64
import datetime
import json
import re
import pandas as pd
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
def remote_call(filename, binary):
q = {
"header": {
"uid": 1,
"user": "kevinhu",
"log_id": filename
},
"request": {
"p": {
"request_id": "1",
"encrypt_type": "base64",
"filename": filename,
"langtype": '',
"fileori": base64.b64encode(binary).decode('utf-8')
},
"c": "resume_parse_module",
"m": "resume_parse"
}
}
for _ in range(3):
try:
resume = requests.post(
"http://127.0.0.1:61670/tog",
data=json.dumps(q))
resume = resume.json()["response"]["results"]
resume = refactor(resume)
for k in ["education", "work", "project",
"training", "skill", "certificate", "language"]:
if not resume.get(k) and k in resume:
del resume[k]
resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
resume = step_two.parse(resume)
return resume
except Exception:
logging.exception("Resume parser has not been supported yet!")
return {}
def chunk(filename, binary=None, callback=None, **kwargs):
"""
The supported file formats are pdf, docx and txt.
To maximize the effectiveness, parse the resume correctly, please concat us: https://github.com/infiniflow/ragflow
"""
if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
raise NotImplementedError("file type not supported yet(pdf supported)")
if not binary:
with open(filename, "rb") as f:
binary = f.read()
callback(0.2, "Resume parsing is going on...")
resume = remote_call(filename, binary)
if len(resume.keys()) < 7:
callback(-1, "Resume is not successfully parsed.")
raise Exception("Resume parser remote call fail!")
callback(0.6, "Done parsing. Chunking...")
logging.debug("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
field_map = {
"name_kwd": "姓名/名字",
"name_pinyin_kwd": "姓名拼音/名字拼音",
"gender_kwd": "性别(男,女)",
"age_int": "年龄/岁/年纪",
"phone_kwd": "电话/手机/微信",
"email_tks": "email/e-mail/邮箱",
"position_name_tks": "职位/职能/岗位/职责",
"expect_city_names_tks": "期望城市",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"first_school_name_tks": "第一学历毕业学校",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"highest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"edu_first_fea_kwd": "第一学历标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"degree_kwd": "过往学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"major_tks": "学过的专业/过往专业",
"school_name_tks": "学校/毕业院校",
"sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
"edu_fea_kwd": "教育标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
"edu_end_int": "毕业年份",
"industry_name_tks": "所在行业",
"birth_dt": "生日/出生年份",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
}
titles = []
for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
v = resume.get(n, "")
if isinstance(v, list):
v = v[0]
if n.find("tks") > 0:
v = rmSpace(v)
titles.append(str(v))
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pairs = []
for n, m in field_map.items():
if not resume.get(n):
continue
v = resume[n]
if isinstance(v, list):
v = " ".join(v)
if n.find("tks") > 0:
v = rmSpace(v)
pairs.append((m, str(v)))
doc["content_with_weight"] = "\n".join(
["{}: {}".format(re.sub(r"[^]+", "", k), v) for k, v in pairs])
doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:
continue
if isinstance(resume[n], list) and (
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks") > 0:
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
doc[n] = resume[n]
logging.debug("chunked resume to " + str(doc))
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": field_map})
return [doc]
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)

402
rag/app/table.py Normal file
View File

@@ -0,0 +1,402 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO
from xpinyin import Pinyin
import numpy as np
import pandas as pd
from collections import Counter
# from openpyxl import load_workbook, Workbook
from dateutil.parser import parse as datetime_parse
from api.db.services.knowledgebase_service import KnowledgebaseService
from deepdoc.parser.utils import get_text
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import ExcelParser
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
if not binary:
wb = Excel._load_excel_to_workbook(fnm)
else:
wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows))
res, fails, done = [], [], 0
rn = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
headers, header_rows = self._parse_headers(ws, rows)
if not headers:
continue
data = []
for i, r in enumerate(rows[header_rows:]):
rn += 1
if rn - 1 < from_page:
continue
if rn - 1 >= to_page:
break
row_data = self._extract_row_data(ws, r, header_rows + i, len(headers))
if row_data is None:
fails.append(str(i))
continue
if self._is_empty_row(row_data):
continue
data.append(row_data)
done += 1
if len(data) == 0:
continue
df = pd.DataFrame(data, columns=headers)
res.append(df)
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
def _parse_headers(self, ws, rows):
if len(rows) == 0:
return [], 0
has_complex_structure = self._has_complex_header_structure(ws, rows)
if has_complex_structure:
return self._parse_multi_level_headers(ws, rows)
else:
return self._parse_simple_headers(rows)
def _has_complex_header_structure(self, ws, rows):
if len(rows) < 1:
return False
merged_ranges = list(ws.merged_cells.ranges)
# 检查前两行是否涉及合并单元格
for rng in merged_ranges:
if rng.min_row <= 2: # 只要合并区域涉及第1或第2行
return True
return False
def _row_looks_like_header(self, row):
header_like_cells = 0
data_like_cells = 0
non_empty_cells = 0
for cell in row:
if cell.value is not None:
non_empty_cells += 1
val = str(cell.value).strip()
if self._looks_like_header(val):
header_like_cells += 1
elif self._looks_like_data(val):
data_like_cells += 1
if non_empty_cells == 0:
return False
return header_like_cells >= data_like_cells
def _parse_simple_headers(self, rows):
if not rows:
return [], 0
header_row = rows[0]
headers = []
for cell in header_row:
if cell.value is not None:
header_value = str(cell.value).strip()
if header_value:
headers.append(header_value)
else:
pass
final_headers = []
for i, cell in enumerate(header_row):
if cell.value is not None:
header_value = str(cell.value).strip()
if header_value:
final_headers.append(header_value)
else:
final_headers.append(f"Column_{i + 1}")
else:
final_headers.append(f"Column_{i + 1}")
return final_headers, 1
def _parse_multi_level_headers(self, ws, rows):
if len(rows) < 2:
return [], 0
header_rows = self._detect_header_rows(rows)
if header_rows == 1:
return self._parse_simple_headers(rows)
else:
return self._build_hierarchical_headers(ws, rows, header_rows), header_rows
def _detect_header_rows(self, rows):
if len(rows) < 2:
return 1
header_rows = 1
max_check_rows = min(5, len(rows))
for i in range(1, max_check_rows):
row = rows[i]
if self._row_looks_like_header(row):
header_rows = i + 1
else:
break
return header_rows
def _looks_like_header(self, value):
if len(value) < 1:
return False
if any(ord(c) > 127 for c in value):
return True
if len([c for c in value if c.isalpha()]) >= 2:
return True
if any(c in value for c in ["(", ")", "", ":", "", "", "_", "-"]):
return True
return False
def _looks_like_data(self, value):
if len(value) == 1 and value.upper() in ["Y", "N", "M", "X", "/", "-"]:
return True
if value.replace(".", "").replace("-", "").replace(",", "").isdigit():
return True
if value.startswith("0x") and len(value) <= 10:
return True
return False
def _build_hierarchical_headers(self, ws, rows, header_rows):
headers = []
max_col = max(len(row) for row in rows[:header_rows]) if header_rows > 0 else 0
merged_ranges = list(ws.merged_cells.ranges)
for col_idx in range(max_col):
header_parts = []
for row_idx in range(header_rows):
if col_idx < len(rows[row_idx]):
cell_value = rows[row_idx][col_idx].value
merged_value = self._get_merged_cell_value(ws, row_idx + 1, col_idx + 1, merged_ranges)
if merged_value is not None:
cell_value = merged_value
if cell_value is not None:
cell_value = str(cell_value).strip()
if cell_value and cell_value not in header_parts and self._is_valid_header_part(cell_value):
header_parts.append(cell_value)
if header_parts:
header = "-".join(header_parts)
headers.append(header)
else:
headers.append(f"Column_{col_idx + 1}")
final_headers = [h for h in headers if h and h != "-"]
return final_headers
def _is_valid_header_part(self, value):
if len(value) == 1 and value.upper() in ["Y", "N", "M", "X"]:
return False
if value.replace(".", "").replace("-", "").replace(",", "").isdigit():
return False
if value in ["/", "-", "+", "*", "="]:
return False
return True
def _get_merged_cell_value(self, ws, row, col, merged_ranges):
for merged_range in merged_ranges:
if merged_range.min_row <= row <= merged_range.max_row and merged_range.min_col <= col <= merged_range.max_col:
return ws.cell(merged_range.min_row, merged_range.min_col).value
return None
def _extract_row_data(self, ws, row, absolute_row_idx, expected_cols):
row_data = []
merged_ranges = list(ws.merged_cells.ranges)
actual_row_num = absolute_row_idx + 1
for col_idx in range(expected_cols):
cell_value = None
actual_col_num = col_idx + 1
try:
cell_value = ws.cell(row=actual_row_num, column=actual_col_num).value
except ValueError:
if col_idx < len(row):
cell_value = row[col_idx].value
if cell_value is None:
merged_value = self._get_merged_cell_value(ws, actual_row_num, actual_col_num, merged_ranges)
if merged_value is not None:
cell_value = merged_value
else:
cell_value = self._get_inherited_value(ws, actual_row_num, actual_col_num, merged_ranges)
row_data.append(cell_value)
return row_data
def _get_inherited_value(self, ws, row, col, merged_ranges):
for merged_range in merged_ranges:
if merged_range.min_row <= row <= merged_range.max_row and merged_range.min_col <= col <= merged_range.max_col:
return ws.cell(merged_range.min_row, merged_range.min_col).value
return None
def _is_empty_row(self, row_data):
for val in row_data:
if val is not None and str(val).strip() != "":
return False
return True
def trans_datatime(s):
try:
return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
except Exception:
pass
def trans_bool(s):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return "no"
def column_data_type(arr):
arr = list(arr)
counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
trans = {t: f for f, t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
float_flag = False
for a in arr:
if a is None:
continue
if re.match(r"[+-]?[0-9]+$", str(a).replace("%%", "")) and not str(a).replace("%%", "").startswith("0"):
counts["int"] += 1
if int(str(a)) > 2**63 - 1:
float_flag = True
break
elif re.match(r"[+-]?[0-9.]{,19}$", str(a).replace("%%", "")) and not str(a).replace("%%", "").startswith("0"):
counts["float"] += 1
elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE):
counts["bool"] += 1
elif trans_datatime(str(a)):
counts["datetime"] += 1
else:
counts["text"] += 1
if float_flag:
ty = "float"
else:
counts = sorted(counts.items(), key=lambda x: x[1] * -1)
ty = counts[0][0]
for i in range(len(arr)):
if arr[i] is None:
continue
try:
arr[i] = trans[ty](str(arr[i]))
except Exception:
arr[i] = None
# if ty == "text":
# if len(arr) > 128 and uni / len(arr) < 0.1:
# ty = "keyword"
return arr, ty
def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt file, the delimiter between columns is TAB.
The first line must be column headers.
Column headers must be meaningful terms inorder to make our NLP model understanding.
It's good to enumerate some synonyms using slash '/' to separate, and even better to
enumerate values using brackets like 'gender/sex(male, female)'.
Here are some examples for headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA
Every row in table will be treated as a chunk.
"""
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
fails = []
headers = lines[0].split(kwargs.get("delimiter", "\t"))
rows = []
for i, line in enumerate(lines[1:]):
if i < from_page:
continue
if i >= to_page:
break
row = [field for field in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
fails.append(str(i))
continue
rows.append(row)
callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
dfs = [pd.DataFrame(np.array(rows), columns=headers)]
else:
raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
res = []
PY = Pinyin()
fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
for df in dfs:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
del df[n]
clmns = df.columns.values
if len(clmns) != len(set(clmns)):
col_counts = Counter(clmns)
duplicates = [col for col, count in col_counts.items() if count > 1]
if duplicates:
raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
txts = list(copy.deepcopy(clmns))
py_clmns = [PY.get_pinyins(re.sub(r"(/.*|[^]+?|\([^()]+?\))", "", str(n)), "_")[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
clmn_tys.append(ty)
df[clmns[j]] = cln
if ty == "text":
txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
eng = lang.lower() == "english" # is_english(txts)
for ii, row in df.iterrows():
d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
row_txt = []
for j in range(len(clmns)):
if row[clmns[j]] is None:
continue
if not str(row[clmns[j]]):
continue
if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
continue
fld = clmns_map[j][0]
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
if not row_txt:
continue
tokenize(d, "; ".join(row_txt), eng)
res.append(d)
KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
callback(0.35, "")
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

157
rag/app/tag.py Normal file
View File

@@ -0,0 +1,157 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import re
import csv
from copy import deepcopy
from deepdoc.parser.utils import get_text
from rag.app.qa import Excel
from rag.nlp import rag_tokenizer
def beAdoc(d, q, a, eng, row_num=-1):
d["content_with_weight"] = q
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["tag_kwd"] = [t.strip().replace(".", "_") for t in a.split(",") if t.strip()]
if row_num >= 0:
d["top_int"] = [row_num]
return d
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in excel format, there should be 2 column content and tags without header.
And content column is ahead of tags column.
And it's O.K if it has multiple sheets as long as the columns are rightly composed.
If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate content and tags.
All the deformed lines will be ignored.
Every pair will be treated as a chunk.
"""
eng = lang.lower() == "english"
res = []
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
return res
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
comma, tab = 0, 0
for line in lines:
if len(line.split(",")) == 2:
comma += 1
if len(line.split("\t")) == 2:
tab += 1
delimiter = "\t" if tab >= comma else ","
fails = []
content = ""
i = 0
while i < len(lines):
arr = lines[i].split(delimiter)
if len(arr) != 2:
content += "\n" + lines[i]
elif len(arr) == 2:
content += "\n" + arr[0]
res.append(beAdoc(deepcopy(doc), content, arr[1], eng, i))
content = ""
i += 1
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract TAG: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract TAG: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
fails = []
content = ""
res = []
reader = csv.reader(lines)
for i, row in enumerate(reader):
row = [r.strip() for r in row if r.strip()]
if len(row) != 2:
content += "\n" + lines[i]
elif len(row) == 2:
content += "\n" + row[0]
res.append(beAdoc(deepcopy(doc), content, row[1], eng, i))
content = ""
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Tags: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.6, ("Extract TAG : {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
raise NotImplementedError(
"Excel, csv(txt) format files are supported.")
def label_question(question, kbs):
from api.db.services.knowledgebase_service import KnowledgebaseService
from graphrag.utils import get_tags_from_cache, set_tags_to_cache
from api import settings
tags = None
tag_kb_ids = []
for kb in kbs:
if kb.parser_config.get("tag_kb_ids"):
tag_kb_ids.extend(kb.parser_config["tag_kb_ids"])
if tag_kb_ids:
all_tags = get_tags_from_cache(tag_kb_ids)
if not all_tags:
all_tags = settings.retrievaler.all_tags_in_portion(kb.tenant_id, tag_kb_ids)
set_tags_to_cache(tags=all_tags, kb_ids=tag_kb_ids)
else:
all_tags = json.loads(all_tags)
tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids)
if not tag_kbs:
return tags
tags = settings.retrievaler.tag_query(question,
list(set([kb.tenant_id for kb in tag_kbs])),
tag_kb_ids,
all_tags,
kb.parser_config.get("topn_tags", 3)
)
return tags
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)