将flask改成fastapi
This commit is contained in:
273
deepdoc/parser/markdown_parser.py
Normal file
273
deepdoc/parser/markdown_parser.py
Normal file
@@ -0,0 +1,273 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
import mistune
|
||||
from markdown import markdown
|
||||
|
||||
|
||||
class RAGFlowMarkdownParser:
|
||||
def __init__(self, chunk_token_num=128):
|
||||
self.chunk_token_num = int(chunk_token_num)
|
||||
|
||||
def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
|
||||
tables = []
|
||||
working_text = markdown_text
|
||||
|
||||
def replace_tables_with_rendered_html(pattern, table_list, render=True):
|
||||
new_text = ""
|
||||
last_end = 0
|
||||
for match in pattern.finditer(working_text):
|
||||
raw_table = match.group()
|
||||
table_list.append(raw_table)
|
||||
if separate_tables:
|
||||
# Skip this match (i.e., remove it)
|
||||
new_text += working_text[last_end : match.start()] + "\n\n"
|
||||
else:
|
||||
# Replace with rendered HTML
|
||||
html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
|
||||
new_text += working_text[last_end : match.start()] + html_table + "\n\n"
|
||||
last_end = match.end()
|
||||
new_text += working_text[last_end:]
|
||||
return new_text
|
||||
|
||||
if "|" in markdown_text: # for optimize performance
|
||||
# Standard Markdown table
|
||||
border_table_pattern = re.compile(
|
||||
r"""
|
||||
(?:\n|^)
|
||||
(?:\|.*?\|.*?\|.*?\n)
|
||||
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
||||
(?:\|.*?\|.*?\|.*?\n)+
|
||||
""",
|
||||
re.VERBOSE,
|
||||
)
|
||||
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
|
||||
|
||||
# Borderless Markdown table
|
||||
no_border_table_pattern = re.compile(
|
||||
r"""
|
||||
(?:\n|^)
|
||||
(?:\S.*?\|.*?\n)
|
||||
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
||||
(?:\S.*?\|.*?\n)+
|
||||
""",
|
||||
re.VERBOSE,
|
||||
)
|
||||
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
|
||||
|
||||
if "<table>" in working_text.lower(): # for optimize performance
|
||||
# HTML table extraction - handle possible html/body wrapper tags
|
||||
html_table_pattern = re.compile(
|
||||
r"""
|
||||
(?:\n|^)
|
||||
\s*
|
||||
(?:
|
||||
# case1: <html><body><table>...</table></body></html>
|
||||
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
||||
|
|
||||
# case2: <body><table>...</table></body>
|
||||
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
||||
|
|
||||
# case3: only<table>...</table>
|
||||
(?:<table[^>]*>.*?</table>)
|
||||
)
|
||||
\s*
|
||||
(?=\n|$)
|
||||
""",
|
||||
re.VERBOSE | re.DOTALL | re.IGNORECASE,
|
||||
)
|
||||
|
||||
def replace_html_tables():
|
||||
nonlocal working_text
|
||||
new_text = ""
|
||||
last_end = 0
|
||||
for match in html_table_pattern.finditer(working_text):
|
||||
raw_table = match.group()
|
||||
tables.append(raw_table)
|
||||
if separate_tables:
|
||||
new_text += working_text[last_end : match.start()] + "\n\n"
|
||||
else:
|
||||
new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
|
||||
last_end = match.end()
|
||||
new_text += working_text[last_end:]
|
||||
working_text = new_text
|
||||
|
||||
replace_html_tables()
|
||||
|
||||
return working_text, tables
|
||||
|
||||
|
||||
class MarkdownElementExtractor:
|
||||
def __init__(self, markdown_content):
|
||||
self.markdown_content = markdown_content
|
||||
self.lines = markdown_content.split("\n")
|
||||
self.ast_parser = mistune.create_markdown(renderer="ast")
|
||||
self.ast_nodes = self.ast_parser(markdown_content)
|
||||
|
||||
def extract_elements(self):
|
||||
"""Extract individual elements (headers, code blocks, lists, etc.)"""
|
||||
sections = []
|
||||
|
||||
i = 0
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
|
||||
if re.match(r"^#{1,6}\s+.*$", line):
|
||||
# header
|
||||
element = self._extract_header(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif line.strip().startswith("```"):
|
||||
# code block
|
||||
element = self._extract_code_block(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
|
||||
# list block
|
||||
element = self._extract_list_block(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif line.strip().startswith(">"):
|
||||
# blockquote
|
||||
element = self._extract_blockquote(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
elif line.strip():
|
||||
# text block (paragraphs and inline elements until next block element)
|
||||
element = self._extract_text_block(i)
|
||||
sections.append(element["content"])
|
||||
i = element["end_line"] + 1
|
||||
else:
|
||||
i += 1
|
||||
|
||||
sections = [section for section in sections if section.strip()]
|
||||
return sections
|
||||
|
||||
def _extract_header(self, start_pos):
|
||||
return {
|
||||
"type": "header",
|
||||
"content": self.lines[start_pos],
|
||||
"start_line": start_pos,
|
||||
"end_line": start_pos,
|
||||
}
|
||||
|
||||
def _extract_code_block(self, start_pos):
|
||||
end_pos = start_pos
|
||||
content_lines = [self.lines[start_pos]]
|
||||
|
||||
# Find the end of the code block
|
||||
for i in range(start_pos + 1, len(self.lines)):
|
||||
content_lines.append(self.lines[i])
|
||||
end_pos = i
|
||||
if self.lines[i].strip().startswith("```"):
|
||||
break
|
||||
|
||||
return {
|
||||
"type": "code_block",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
|
||||
def _extract_list_block(self, start_pos):
|
||||
end_pos = start_pos
|
||||
content_lines = []
|
||||
|
||||
i = start_pos
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
# check if this line is a list item or continuation of a list
|
||||
if (
|
||||
re.match(r"^\s*[-*+]\s+.*$", line)
|
||||
or re.match(r"^\s*\d+\.\s+.*$", line)
|
||||
or (i > start_pos and not line.strip())
|
||||
or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
|
||||
or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
|
||||
or (i > start_pos and re.match(r"^\s+\w+.*$", line))
|
||||
):
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
|
||||
return {
|
||||
"type": "list_block",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
|
||||
def _extract_blockquote(self, start_pos):
|
||||
end_pos = start_pos
|
||||
content_lines = []
|
||||
|
||||
i = start_pos
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
if line.strip().startswith(">") or (i > start_pos and not line.strip()):
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
|
||||
return {
|
||||
"type": "blockquote",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
|
||||
def _extract_text_block(self, start_pos):
|
||||
"""Extract a text block (paragraphs, inline elements) until next block element"""
|
||||
end_pos = start_pos
|
||||
content_lines = [self.lines[start_pos]]
|
||||
|
||||
i = start_pos + 1
|
||||
while i < len(self.lines):
|
||||
line = self.lines[i]
|
||||
# stop if we encounter a block element
|
||||
if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
|
||||
break
|
||||
elif not line.strip():
|
||||
# check if the next line is a block element
|
||||
if i + 1 < len(self.lines) and (
|
||||
re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
|
||||
or self.lines[i + 1].strip().startswith("```")
|
||||
or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
|
||||
or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
|
||||
or self.lines[i + 1].strip().startswith(">")
|
||||
):
|
||||
break
|
||||
else:
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
else:
|
||||
content_lines.append(line)
|
||||
end_pos = i
|
||||
i += 1
|
||||
|
||||
return {
|
||||
"type": "text_block",
|
||||
"content": "\n".join(content_lines),
|
||||
"start_line": start_pos,
|
||||
"end_line": end_pos,
|
||||
}
|
||||
Reference in New Issue
Block a user