Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -1,6 +1,8 @@
"""文档解析服务"""
"""Initialize the app.services.parser package."""
from .pdf_parser import PDFParser
from .docx_parser import DocxParser
# Keep package boundaries explicit so backend imports stay predictable.
__all__ = ["PDFParser", "DocxParser"]

View File

@@ -1,4 +1,4 @@
"""Word文档解析 - 使用python-docx"""
"""Provide service-layer logic for docx parser."""
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
@@ -6,27 +6,29 @@ from typing import List, Dict, Optional
from dataclasses import dataclass, field
from loguru import logger
import re
# Data containers describing the content extracted from a Word document.
@dataclass
class DocxParagraph:
    """One paragraph of a Word document together with its structural role."""

    # Text content of the paragraph.
    text: str
    # Heading level; 0 means ordinary body text.
    level: int = 0
    # True when the paragraph was recognised as a list item.
    is_list: bool = False
    # The list marker that was recognised (for example "1."), if any.
    list_number: Optional[str] = None
@dataclass
class DocxTable:
    """A table extracted from a Word document."""

    # Cell texts, one inner list per table row.
    rows: List[List[str]]
    # Markdown rendering of ``rows``; defaults to an empty string.
    markdown: str = ""
@dataclass
class DocxDocumentContent:
"""Word文档完整内容"""
"""Represent the Docx Document Content type."""
file_path: str
paragraphs: List[DocxParagraph]
tables: List[DocxTable]
@@ -35,21 +37,14 @@ class DocxDocumentContent:
class DocxParser:
"""Word文档解析器 - 基于python-docx"""
"""Provide the Docx Parser parser."""
def __init__(self):
"""Initialize the Docx Parser instance."""
self.document = None
def parse(self, file_path: str) -> DocxDocumentContent:
"""
解析Word文档
Args:
file_path: Word文档路径
Returns:
DocxDocumentContent: 解析后的文档内容
"""
"""Handle parse for the Docx Parser instance."""
logger.info(f"开始解析Word文档: {file_path}")
try:
@@ -60,16 +55,16 @@ class DocxParser:
tables=[]
)
# 提取文档元数据
# Keep service responsibilities explicit so downstream behavior stays predictable.
doc_content.metadata = self._extract_metadata()
# 提取段落
# Keep service responsibilities explicit so downstream behavior stays predictable.
doc_content.paragraphs = self._extract_paragraphs()
# 提取表格
# Keep service responsibilities explicit so downstream behavior stays predictable.
doc_content.tables = self._extract_tables()
# 生成Markdown格式文本
# Keep service responsibilities explicit so downstream behavior stays predictable.
doc_content.markdown_text = self._generate_markdown(doc_content)
logger.success(f"Word文档解析完成{len(doc_content.paragraphs)}个段落")
@@ -81,7 +76,7 @@ class DocxParser:
raise
def _extract_metadata(self) -> Dict[str, str]:
"""提取文档元数据"""
"""Handle extract metadata for this module for the Docx Parser instance."""
metadata = {}
try:
core_props = self.document.core_properties
@@ -98,7 +93,7 @@ class DocxParser:
return metadata
def _extract_paragraphs(self) -> List[DocxParagraph]:
"""提取所有段落"""
"""Handle extract paragraphs for this module for the Docx Parser instance."""
paragraphs = []
for para in self.document.paragraphs:
@@ -106,10 +101,10 @@ class DocxParser:
if not text:
continue
# 判断标题级别
# Keep service responsibilities explicit so downstream behavior stays predictable.
level = self._get_paragraph_level(para)
# 判断是否是列表项
# Keep service responsibilities explicit so downstream behavior stays predictable.
is_list, list_number = self._detect_list_item(para)
paragraph = DocxParagraph(
@@ -123,66 +118,61 @@ class DocxParser:
return paragraphs
def _get_paragraph_level(self, para) -> int:
"""
判断段落标题级别
Returns:
int: 标题级别0表示正文
"""
# 方法1检查段落样式
"""Handle get paragraph level for this module for the Docx Parser instance."""
# Keep service responsibilities explicit so downstream behavior stays predictable.
style_name = para.style.name if para.style else ""
if "Heading" in style_name or "标题" in style_name:
# 从样式名称中提取级别
# Keep service responsibilities explicit so downstream behavior stays predictable.
match = re.search(r'Heading\s*(\d)|标题\s*(\d)', style_name)
if match:
level = int(match.group(1) or match.group(2))
return level
# 方法2检查段落格式字号
# 标题通常字号较大
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
if para.paragraph_format:
# 可以根据字号判断,这里简化处理
# Keep service responsibilities explicit so downstream behavior stays predictable.
pass
# 方法3根据内容模式判断法规文档特征
# Keep service responsibilities explicit so downstream behavior stays predictable.
text = para.text.strip()
# 第一章、第X章 -> 二级标题
# Keep service responsibilities explicit so downstream behavior stays predictable.
if re.match(r'^第[一二三四五六七八九十百]+章\s', text):
return 2
# 第X节 -> 三级标题
# Keep service responsibilities explicit so downstream behavior stays predictable.
elif re.match(r'^第[一二三四五六七八九十百]+节\s', text):
return 3
# 第X条 -> 四级标题
# Keep service responsibilities explicit so downstream behavior stays predictable.
elif re.match(r'^第[一二三四五六七八九十百]+条\s', text):
return 4
return 0 # 正文
return 0 # Keep service responsibilities explicit so downstream behavior stays predictable.
def _detect_list_item(self, para) -> tuple[bool, Optional[str]]:
"""检测是否是列表项"""
"""Handle detect list item for this module for the Docx Parser instance."""
text = para.text.strip()
# 数字列表1.、2.、1、[1]等
# Keep service responsibilities explicit so downstream behavior stays predictable.
if re.match(r'^[\d]+[.、)\]]\s', text):
match = re.match(r'^([\d]+[.、)\]])\s', text)
return True, match.group(1) if match else None
# 中文数字列表:一、二、(一)等
# Keep service responsibilities explicit so downstream behavior stays predictable.
if re.match(r'^[一二三四五六七八九十]+[、.)]\s', text):
match = re.match(r'^([一二三四五六七八九十]+[、.)])\s', text)
return True, match.group(1) if match else None
# 检查段落格式中的列表编号
# Keep service responsibilities explicit so downstream behavior stays predictable.
if para.paragraph_format and hasattr(para.paragraph_format, 'left_indent'):
# 有缩进的可能是列表项
# Keep service responsibilities explicit so downstream behavior stays predictable.
pass
return False, None
def _extract_tables(self) -> List[DocxTable]:
"""提取所有表格"""
"""Handle extract tables for this module for the Docx Parser instance."""
tables = []
for table in self.document.tables:
@@ -193,7 +183,7 @@ class DocxParser:
cells.append(cell.text.strip())
rows.append(cells)
# 转换为Markdown表格
# Keep service responsibilities explicit so downstream behavior stays predictable.
markdown = self._table_to_markdown(rows)
table_content = DocxTable(rows=rows, markdown=markdown)
@@ -202,34 +192,34 @@ class DocxParser:
return tables
def _table_to_markdown(self, rows: List[List[str]]) -> str:
"""将表格转换为Markdown格式"""
"""Handle table to markdown for this module for the Docx Parser instance."""
if not rows or len(rows) < 1:
return ""
lines = []
# 表头
# Keep service responsibilities explicit so downstream behavior stays predictable.
if len(rows) >= 1:
header = rows[0]
lines.append("| " + " | ".join(cell for cell in header) + " |")
lines.append("| " + " | ".join("---" for _ in header) + " |")
# 数据行
# Keep service responsibilities explicit so downstream behavior stays predictable.
for row in rows[1:]:
lines.append("| " + " | ".join(cell for cell in row) + " |")
return "\n".join(lines)
def _generate_markdown(self, doc_content: DocxDocumentContent) -> str:
"""生成Markdown格式文本"""
"""Handle generate markdown for this module for the Docx Parser instance."""
lines = []
# 文档标题
# Keep service responsibilities explicit so downstream behavior stays predictable.
title = doc_content.metadata.get("title", "")
if title:
lines.append(f"# {title}\n")
else:
# 从第一个段落获取标题(如果是标题样式)
# Keep service responsibilities explicit so downstream behavior stays predictable.
for para in doc_content.paragraphs[:5]:
if para.level == 1:
lines.append(f"# {para.text}\n")
@@ -237,29 +227,29 @@ class DocxParser:
else:
lines.append(f"# {doc_content.file_path}\n")
# 元数据信息
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append("\n## 文档信息\n")
for key, value in doc_content.metadata.items():
if value:
lines.append(f"- **{key}**: {value}")
# 正文内容
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append("\n## 正文\n")
table_index = 0
for para in doc_content.paragraphs:
if para.level > 0:
# 标题
# Keep service responsibilities explicit so downstream behavior stays predictable.
prefix = "#" * para.level
lines.append(f"\n{prefix} {para.text}\n")
elif para.is_list:
# 列表项
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append(f"- {para.text}")
else:
# 正文
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append(para.text)
# 添加表格
# Keep service responsibilities explicit so downstream behavior stays predictable.
if doc_content.tables:
lines.append("\n## 表格\n")
for i, table in enumerate(doc_content.tables):
@@ -269,18 +259,18 @@ class DocxParser:
return "\n".join(lines)
def parse_to_markdown(self, file_path: str) -> str:
    """Parse the Word document at ``file_path`` and return only its Markdown text.

    Args:
        file_path: Path of the Word document, passed straight to ``parse``.

    Returns:
        str: The ``markdown_text`` of the parsed document content.
    """
    return self.parse(file_path).markdown_text
def parse_docx(file_path: str) -> DocxDocumentContent:
    """Convenience wrapper: parse a Word document with a fresh ``DocxParser``.

    Args:
        file_path: Path of the Word document.

    Returns:
        DocxDocumentContent: Whatever ``DocxParser.parse`` returns.
    """
    return DocxParser().parse(file_path)
def parse_docx_to_markdown(file_path: str) -> str:
    """Convenience wrapper: parse a Word document and return its Markdown text.

    Args:
        file_path: Path of the Word document.

    Returns:
        str: The result of ``DocxParser.parse_to_markdown``.
    """
    return DocxParser().parse_to_markdown(file_path)

View File

@@ -1,14 +1,16 @@
"""MinerU多模态PDF解析 - 版面感知解析"""
"""Provide service-layer logic for mineru parser."""
from typing import Optional, Dict
from dataclasses import dataclass, field
from loguru import logger
import os
# Result container returned by MinerUParser.parse.
@dataclass
class MinerUResult:
"""MinerU解析结果"""
"""Represent the Miner U Result type."""
file_path: str
markdown_text: str
metadata: Dict[str, str] = field(default_factory=dict)
@@ -17,21 +19,14 @@ class MinerUResult:
class MinerUParser:
"""
MinerU多模态PDF解析器
MinerU (magic-pdf) 是一个开源的高质量PDF解析工具
支持版面感知解析,能够识别文档中的标题、正文、表格、图片等元素,
并输出结构化的Markdown格式。
GitHub: https://github.com/opendatalab/MinerU
"""
"""Provide the Miner U Parser parser."""
def __init__(self):
"""Initialize the Miner U Parser instance."""
self.available = self._check_mineru_available()
def _check_mineru_available(self) -> bool:
"""检查MinerU是否可用"""
"""Handle check mineru available for this module for the Miner U Parser instance."""
try:
from magic_pdf.pipe.UNIPipe import UNIPipe
return True
@@ -40,16 +35,7 @@ class MinerUParser:
return False
def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult:
"""
使用MinerU解析PDF文档
Args:
file_path: PDF文件路径
output_dir: 输出目录(可选,用于保存解析产物)
Returns:
MinerUResult: 解析结果
"""
"""Handle parse for the Miner U Parser instance."""
logger.info(f"尝试使用MinerU解析: {file_path}")
if not self.available:
@@ -64,19 +50,19 @@ class MinerUParser:
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.libs.MakeContentConfig import DropMode
# 设置输出目录
# Keep service responsibilities explicit so downstream behavior stays predictable.
if output_dir is None:
output_dir = os.path.dirname(file_path)
# 创建解析管道
# OCR模式可以根据PDF类型选择
# auto: 自动判断是否需要OCR
# txt: 纯文本PDF无OCR
# ocr: 扫描件PDFOCR
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
pipe = UNIPipe(file_path, output_dir)
# 执行解析
# pipe_mk() 返回Markdown格式文本
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
markdown_content = pipe.pipe_mk()
logger.success(f"MinerU解析成功")
@@ -98,13 +84,13 @@ class MinerUParser:
)
def _extract_metadata(self, pipe) -> Dict[str, str]:
"""从解析管道提取元数据"""
"""Handle extract metadata for this module for the Miner U Parser instance."""
metadata = {}
try:
# MinerU解析管道中可能包含的元数据信息
# Keep service responsibilities explicit so downstream behavior stays predictable.
if hasattr(pipe, 'pdf_mid_data') and pipe.pdf_mid_data:
mid_data = pipe.pdf_mid_data
# 提取可能的元数据字段
# Keep service responsibilities explicit so downstream behavior stays predictable.
metadata = {
"page_count": str(mid_data.get("page_count", "")),
"language": str(mid_data.get("language", "")),
@@ -116,41 +102,27 @@ class MinerUParser:
return metadata
def parse_to_markdown(self, file_path: str) -> str:
    """Parse ``file_path`` and return its Markdown text.

    Args:
        file_path: Path of the document, passed straight to ``parse``.

    Returns:
        str: ``markdown_text`` of the parse result when the result reports
        success; otherwise an empty string.
    """
    outcome = self.parse(file_path)
    if not outcome.success:
        return ""
    return outcome.markdown_text
class ParserOrchestrator:
"""
解析服务编排 - 按优先级选择解析器
解析策略:
1. 优先尝试MinerU版面感知能力强
2. MinerU失败时回退到基础PyMuPDF解析
"""
"""Represent the Parser Orchestrator type."""
def __init__(self) -> None:
    """Create both PDF parsers and cache whether MinerU is usable."""
    # Imported here rather than at module level, as in the original code.
    from .pdf_parser import PDFParser

    mineru = MinerUParser()
    self.mineru_parser = mineru
    self.pdf_parser = PDFParser()
    self.mineru_available = mineru.available
def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
"""
解析PDF文档按优先级选择解析器
Args:
file_path: PDF文件路径
prefer_mineru: 是否优先使用MinerU
Returns:
str: Markdown格式文本
"""
"""Parse pdf for the Parser Orchestrator instance."""
markdown_text = ""
if prefer_mineru and self.mineru_available:
# 优先尝试MinerU
# Keep service responsibilities explicit so downstream behavior stays predictable.
result = self.mineru_parser.parse(file_path)
if result.success:
markdown_text = result.markdown_text
@@ -159,28 +131,20 @@ class ParserOrchestrator:
else:
logger.warning(f"MinerU解析失败回退到PyMuPDF: {result.error_message}")
# 回退到PyMuPDF基础解析
# Keep service responsibilities explicit so downstream behavior stays predictable.
logger.info("使用PyMuPDF基础解析")
markdown_text = self.pdf_parser.parse_to_markdown(file_path)
return markdown_text
def parse_docx(self, file_path: str) -> str:
    """Parse a Word document and return its Markdown text.

    Args:
        file_path: Path of the Word document.

    Returns:
        str: The result of ``DocxParser.parse_to_markdown``.
    """
    # Imported here rather than at module level, as in the original code.
    from .docx_parser import DocxParser

    return DocxParser().parse_to_markdown(file_path)
def parse(self, file_path: str) -> str:
"""
根据文件类型选择解析器
Args:
file_path: 文件路径
Returns:
str: Markdown格式文本
"""
"""Handle parse for the Parser Orchestrator instance."""
ext = os.path.splitext(file_path)[1].lower()
if ext == ".pdf":
@@ -192,12 +156,12 @@ class ParserOrchestrator:
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Convenience wrapper: parse a document with a fresh ``MinerUParser``.

    Args:
        file_path: Path of the document to parse.

    Returns:
        MinerUResult: Whatever ``MinerUParser.parse`` returns.
    """
    return MinerUParser().parse(file_path)
def parse_pdf_smart(file_path: str) -> str:
    """Convenience wrapper: parse a PDF through a fresh ``ParserOrchestrator``.

    Args:
        file_path: Path of the PDF document.

    Returns:
        str: The result of ``ParserOrchestrator.parse_pdf`` called with its
        default parser preference.
    """
    return ParserOrchestrator().parse_pdf(file_path)

View File

@@ -1,4 +1,4 @@
"""PDF文档解析 - 使用PyMuPDF基础解析"""
"""Provide service-layer logic for pdf parser."""
import fitz # PyMuPDF
from typing import List, Dict, Optional, Tuple
@@ -9,17 +9,17 @@ import re
@dataclass
class PDFPageContent:
    """Content extracted from a single PDF page."""

    # Page number of this page.
    page_number: int
    # Plain text of the page.
    text: str
    # Tables found on the page, each stored as a string.
    tables: List[str] = field(default_factory=list)
    # List of image paths.
    images: List[str] = field(default_factory=list)
    # Text blocks of the page, each stored as a dict.
    blocks: List[Dict] = field(default_factory=list)
@dataclass
class PDFDocumentContent:
"""PDF文档完整内容"""
"""Represent the P D F Document Content type."""
file_path: str
total_pages: int
pages: List[PDFPageContent]
@@ -28,23 +28,14 @@ class PDFDocumentContent:
class PDFParser:
"""PDF文档解析器 - 基于PyMuPDF"""
"""Provide the P D F Parser parser."""
def __init__(self):
"""Initialize the P D F Parser instance."""
self.pdf = None
def parse(self, file_path: str, extract_tables: bool = True, extract_images: bool = False) -> PDFDocumentContent:
"""
解析PDF文档
Args:
file_path: PDF文件路径
extract_tables: 是否提取表格
extract_images: 是否提取图片
Returns:
PDFDocumentContent: 解析后的文档内容
"""
"""Handle parse for the P D F Parser instance."""
logger.info(f"开始解析PDF文档: {file_path}")
try:
@@ -55,16 +46,16 @@ class PDFParser:
pages=[]
)
# 提取文档元数据
# Keep service responsibilities explicit so downstream behavior stays predictable.
doc_content.metadata = self._extract_metadata()
# 逐页解析
# Keep service responsibilities explicit so downstream behavior stays predictable.
for page_num in range(self.pdf.page_count):
page = self.pdf[page_num]
page_content = self._parse_page(page, page_num + 1, extract_tables, extract_images)
doc_content.pages.append(page_content)
# 生成Markdown格式文本
# Keep service responsibilities explicit so downstream behavior stays predictable.
doc_content.markdown_text = self._generate_markdown(doc_content)
self.pdf.close()
@@ -77,7 +68,7 @@ class PDFParser:
raise
def _extract_metadata(self) -> Dict[str, str]:
"""提取PDF元数据"""
"""Handle extract metadata for this module for the P D F Parser instance."""
metadata = {}
try:
meta = self.pdf.metadata
@@ -97,23 +88,23 @@ class PDFParser:
def _parse_page(self, page: fitz.Page, page_num: int,
extract_tables: bool, extract_images: bool) -> PDFPageContent:
"""解析单页内容"""
"""Handle parse page for this module for the P D F Parser instance."""
page_content = PDFPageContent(page_number=page_num, text="")
# 提取文本块(保留结构)
# Keep service responsibilities explicit so downstream behavior stays predictable.
blocks = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)["blocks"]
page_content.blocks = blocks
# 提取纯文本
# Keep service responsibilities explicit so downstream behavior stays predictable.
text = page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
page_content.text = text.strip()
# 提取表格使用PyMuPDF的表格提取功能
# Keep service responsibilities explicit so downstream behavior stays predictable.
if extract_tables:
tables = self._extract_tables_from_page(page)
page_content.tables = tables
# 提取图片
# Keep service responsibilities explicit so downstream behavior stays predictable.
if extract_images:
images = self._extract_images_from_page(page, page_num)
page_content.images = images
@@ -121,25 +112,22 @@ class PDFParser:
return page_content
def _extract_tables_from_page(self, page: fitz.Page) -> List[str]:
"""
从页面提取表格(基于文本块分析)
注意PyMuPDF基础版表格提取能力有限复杂表格建议使用MinerU
"""
"""Handle extract tables from page for this module for the P D F Parser instance."""
tables = []
try:
# 使用PyMuPDF的表格提取方法2.4+版本)
# 对于更复杂的表格需要在mineru_parser中使用更高级的方法
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
tabs = page.find_tables()
if tabs:
for tab in tabs:
table_text = tab.extract()
# 将表格转换为Markdown格式
# Keep service responsibilities explicit so downstream behavior stays predictable.
markdown_table = self._table_to_markdown(table_text)
tables.append(markdown_table)
except AttributeError:
# 旧版本PyMuPDF没有表格提取功能
# Keep service responsibilities explicit so downstream behavior stays predictable.
logger.warning("PyMuPDF版本不支持表格提取请升级到2.4+版本")
except Exception as e:
logger.warning(f"表格提取失败: {e}")
@@ -147,28 +135,28 @@ class PDFParser:
return tables
def _table_to_markdown(self, table_data: List[List[str]]) -> str:
"""将表格数据转换为Markdown格式"""
"""Handle table to markdown for this module for the P D F Parser instance."""
if not table_data or len(table_data) < 1:
return ""
lines = []
# 表头
# Keep service responsibilities explicit so downstream behavior stays predictable.
if len(table_data) >= 1:
header = table_data[0]
lines.append("| " + " | ".join(str(cell).strip() for cell in header) + " |")
lines.append("| " + " | ".join("---" for _ in header) + " |")
# 数据行
# Keep service responsibilities explicit so downstream behavior stays predictable.
for row in table_data[1:]:
lines.append("| " + " | ".join(str(cell).strip() for cell in row) + " |")
return "\n".join(lines)
def _extract_images_from_page(self, page: fitz.Page, page_num: int) -> List[str]:
"""提取页面图片"""
"""Handle extract images from page for this module for the P D F Parser instance."""
images = []
# 图片提取功能(可选实现)
# 这里仅记录图片信息,实际图片需要额外保存
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
try:
image_list = page.get_images()
for img_index, img in enumerate(image_list):
@@ -179,52 +167,52 @@ class PDFParser:
return images
def _generate_markdown(self, doc_content: PDFDocumentContent) -> str:
"""生成Markdown格式文本"""
"""Handle generate markdown for this module for the P D F Parser instance."""
lines = []
# 文档标题
# Keep service responsibilities explicit so downstream behavior stays predictable.
title = doc_content.metadata.get("title", "")
if title:
lines.append(f"# {title}\n")
else:
lines.append(f"# {doc_content.file_path}\n")
# 元数据信息
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append("\n## 文档信息\n")
for key, value in doc_content.metadata.items():
if value and key in ["author", "subject", "keywords", "creation_date"]:
lines.append(f"- **{key}**: {value}")
# 正文内容
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append("\n## 正文\n")
for page in doc_content.pages:
# 页码标记
# Keep service responsibilities explicit so downstream behavior stays predictable.
lines.append(f"\n---\n**第 {page.page_number} 页**\n")
# 处理文本内容,识别标题结构
# Keep service responsibilities explicit so downstream behavior stays predictable.
text = self._process_page_text(page.text, page.blocks)
lines.append(text)
# 添加表格
# Keep service responsibilities explicit so downstream behavior stays predictable.
for table in page.tables:
lines.append("\n" + table + "\n")
return "\n".join(lines)
def _process_page_text(self, text: str, blocks: List[Dict]) -> str:
"""处理页面文本,识别标题结构"""
# 基于字体大小识别标题
"""Handle process page text for this module for the P D F Parser instance."""
# Keep service responsibilities explicit so downstream behavior stays predictable.
processed_text = text
# 尝试识别标题(基于字号)
# 法规文档通常有明确的层级结构:章、节、条
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
processed_text = self._detect_headers(text, blocks)
return processed_text
def _detect_headers(self, text: str, blocks: List[Dict]) -> str:
"""检测并标记标题(基于字号或内容模式)"""
"""Handle detect headers for this module for the P D F Parser instance."""
lines = text.split("\n")
processed_lines = []
@@ -233,8 +221,8 @@ class PDFParser:
if not line:
continue
# 法规标题模式检测
# 第一章、第X章、第X节、第X条等
# Keep service responsibilities explicit so downstream behavior stays predictable.
# Keep service responsibilities explicit so downstream behavior stays predictable.
if re.match(r'^第[一二三四五六七八九十百]+章\s', line):
processed_lines.append(f"\n## {line}\n")
elif re.match(r'^第[一二三四五六七八九十百]+节\s', line):
@@ -242,7 +230,7 @@ class PDFParser:
elif re.match(r'^第[一二三四五六七八九十百]+条\s', line):
processed_lines.append(f"\n#### {line}\n")
elif re.match(r'^[一二三四五六七八九十]+\s*[、.]', line):
# 条款子项
# Keep service responsibilities explicit so downstream behavior stays predictable.
processed_lines.append(f"- {line}")
else:
processed_lines.append(line)
@@ -250,18 +238,18 @@ class PDFParser:
return "\n".join(processed_lines)
def parse_to_markdown(self, file_path: str) -> str:
    """Parse the PDF at ``file_path`` and return only its Markdown text.

    Args:
        file_path: Path of the PDF document, passed straight to ``parse``.

    Returns:
        str: The ``markdown_text`` of the parsed document content.
    """
    return self.parse(file_path).markdown_text
def parse_pdf(file_path: str, **kwargs) -> PDFDocumentContent:
    """Convenience wrapper: parse a PDF with a fresh ``PDFParser``.

    Args:
        file_path: Path of the PDF document.
        **kwargs: Forwarded unchanged to ``PDFParser.parse``.

    Returns:
        PDFDocumentContent: Whatever ``PDFParser.parse`` returns.
    """
    return PDFParser().parse(file_path, **kwargs)
def parse_pdf_to_markdown(file_path: str) -> str:
    """Convenience wrapper: parse a PDF and return its Markdown text.

    Args:
        file_path: Path of the PDF document.

    Returns:
        str: The result of ``PDFParser.parse_to_markdown``.
    """
    return PDFParser().parse_to_markdown(file_path)