2026-05-14 15:07:34 +08:00
|
|
|
|
"""PDF文档解析 - 使用PyMuPDF基础解析"""
|
|
|
|
|
|
|
|
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
from typing import List, Dict, Optional, Tuple
|
|
|
|
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class PDFPageContent:
    """Content extracted from a single PDF page."""

    # Page number as supplied by the parser (PDFParser passes 1-based values).
    page_number: int

    # Plain text of the page.
    text: str

    # Tables found on the page; PDFParser stores each one as a Markdown string.
    tables: List[str] = field(default_factory=list)

    # Image entries for the page. The original comment describes these as
    # image paths; PDFParser currently stores identifier strings rather than
    # paths of saved files.
    images: List[str] = field(default_factory=list)

    # Raw block dictionaries for the page (PDFParser fills this from
    # ``page.get_text("dict")["blocks"]``).
    blocks: List[Dict] = field(default_factory=list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class PDFDocumentContent:
    """Complete parsed content of one PDF document."""

    # Path the document was opened from.
    file_path: str

    # Number of pages reported for the document.
    total_pages: int

    # Per-page content, in the order the pages were parsed.
    pages: List[PDFPageContent]

    # Document metadata as string key/value pairs.
    metadata: Dict[str, str] = field(default_factory=dict)

    # Markdown rendering of the whole document; empty until generated.
    markdown_text: str = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFParser:
    """PDF document parser built on PyMuPDF (``fitz``).

    ``self.pdf`` holds the open ``fitz`` document only while :meth:`parse`
    is running; it is ``None`` before the first call and after every call,
    whether the call succeeded or failed.
    """

    # Numerals accepted inside "第…章/节/条" markers. 零/〇/千 are needed for
    # numbers such as 第一百零一条 or 第一千二百六十条.
    _CN_NUMERAL = "[零〇一二三四五六七八九十百千]+"

    # (pattern, Markdown heading marker) pairs, checked in order. The marker
    # word must be followed by whitespace or end the line, so a line that
    # consists only of "第一章" is still recognised as a heading.
    _HEADING_RULES = (
        (re.compile(rf"^第{_CN_NUMERAL}章(?:\s|$)"), "##"),
        (re.compile(rf"^第{_CN_NUMERAL}节(?:\s|$)"), "###"),
        (re.compile(rf"^第{_CN_NUMERAL}条(?:\s|$)"), "####"),
    )

    # Enumerated sub-items such as "一、" or "二."
    _LIST_ITEM_RE = re.compile(r"^[一二三四五六七八九十]+\s*[、.]")

    # Line breaks (with surrounding whitespace) inside a table cell.
    _CELL_BREAK_RE = re.compile(r"\s*[\r\n]+\s*")

    def __init__(self):
        """Create a parser with no document open."""
        self.pdf = None

    def parse(self, file_path: str, extract_tables: bool = True,
              extract_images: bool = False) -> "PDFDocumentContent":
        """Parse a PDF document.

        Args:
            file_path: Path of the PDF file to open.
            extract_tables: Whether to extract tables from each page.
            extract_images: Whether to collect image entries for each page.

        Returns:
            PDFDocumentContent: The parsed document, including per-page
            content, metadata and the generated Markdown text.

        Raises:
            Exception: Any error raised while opening or parsing the file is
                logged and re-raised unchanged.
        """
        logger.info(f"开始解析PDF文档: {file_path}")

        self.pdf = None
        try:
            self.pdf = fitz.open(file_path)
            doc_content = PDFDocumentContent(
                file_path=file_path,
                total_pages=self.pdf.page_count,
                pages=[],
            )

            # Document-level metadata.
            doc_content.metadata = self._extract_metadata()

            # Page-by-page extraction; page numbers handed on are 1-based.
            for page_index in range(self.pdf.page_count):
                page = self.pdf[page_index]
                doc_content.pages.append(
                    self._parse_page(page, page_index + 1,
                                     extract_tables, extract_images)
                )

            # Markdown rendering of the whole document.
            doc_content.markdown_text = self._generate_markdown(doc_content)
        except Exception as e:
            logger.error(f"PDF解析失败: {e}")
            raise
        finally:
            # Always release the file handle, including when parsing failed.
            if self.pdf is not None:
                self.pdf.close()
                self.pdf = None

        logger.success(f"PDF解析完成,共{doc_content.total_pages}页")
        return doc_content

    def _extract_metadata(self) -> Dict[str, str]:
        """Read document metadata from the currently open document.

        Returns:
            A dict with the keys ``title``, ``author``, ``subject``,
            ``keywords``, ``creator``, ``producer``, ``creation_date`` and
            ``mod_date``. Missing or ``None`` values become ``""``. If
            reading the metadata fails, a warning is logged and an empty
            dict is returned.
        """
        # Output key -> key used in the document's metadata mapping.
        key_map = (
            ("title", "title"),
            ("author", "author"),
            ("subject", "subject"),
            ("keywords", "keywords"),
            ("creator", "creator"),
            ("producer", "producer"),
            ("creation_date", "creationDate"),
            ("mod_date", "modDate"),
        )
        try:
            meta = self.pdf.metadata or {}
            # ``or ""`` normalises None values so the result is str -> str.
            return {out_key: meta.get(src_key) or "" for out_key, src_key in key_map}
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
            return {}

    def _parse_page(self, page: "fitz.Page", page_num: int,
                    extract_tables: bool, extract_images: bool) -> "PDFPageContent":
        """Extract the content of a single page.

        Args:
            page: The page object to read from.
            page_num: Page number to record on the result.
            extract_tables: Whether to run table extraction on the page.
            extract_images: Whether to collect image entries for the page.

        Returns:
            PDFPageContent: Text, raw blocks and (optionally) tables and
            image entries of the page.
        """
        page_content = PDFPageContent(page_number=page_num, text="")

        # Structured text blocks.
        page_content.blocks = page.get_text(
            "dict", flags=fitz.TEXT_PRESERVE_WHITESPACE
        )["blocks"]

        # Plain text.
        text = page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
        page_content.text = text.strip()

        if extract_tables:
            page_content.tables = self._extract_tables_from_page(page)

        if extract_images:
            page_content.images = self._extract_images_from_page(page, page_num)

        return page_content

    def _extract_tables_from_page(self, page: "fitz.Page") -> List[str]:
        """Extract the tables of a page as Markdown strings.

        Uses ``page.find_tables()``. The original note on this method says
        that this basic extraction is limited and recommends MinerU for
        complex tables.

        Returns:
            One Markdown table per detected, non-empty table. An empty list
            is returned (with a warning logged) when the page object has no
            ``find_tables`` method or when extraction fails.
        """
        find_tables = getattr(page, "find_tables", None)
        if find_tables is None:
            # Checked explicitly so that an AttributeError raised while
            # converting a table is not misreported as a version problem.
            logger.warning("PyMuPDF版本不支持表格提取,请升级到1.23.0+版本")
            return []

        tables = []
        try:
            for tab in find_tables().tables:
                markdown_table = self._table_to_markdown(tab.extract())
                if markdown_table:
                    tables.append(markdown_table)
        except Exception as e:
            # Best effort: keep whatever was converted before the failure.
            logger.warning(f"表格提取失败: {e}")

        return tables

    @classmethod
    def _format_table_cell(cls, cell) -> str:
        """Return the Markdown-safe text of one table cell."""
        if cell is None:
            # Empty cells arrive as None; render them blank, not as "None".
            return ""
        text = cls._CELL_BREAK_RE.sub(" ", str(cell)).strip()
        # A raw pipe would be read as a column separator.
        return text.replace("|", "\\|")

    def _table_to_markdown(self, table_data: List[List[Optional[str]]]) -> str:
        """Convert table rows to a Markdown table.

        Args:
            table_data: Rows of cells; the first row is used as the header.
                Cells may be ``None``.

        Returns:
            The Markdown table, or ``""`` when ``table_data`` is empty.
            ``None`` cells become empty, line breaks inside a cell become a
            single space, ``|`` is escaped, and rows shorter than the header
            are padded with empty cells.
        """
        if not table_data:
            return ""

        fmt = self._format_table_cell
        header = [fmt(cell) for cell in table_data[0]]
        width = len(header)

        rows = [header, ["---"] * width]
        for row in table_data[1:]:
            cells = [fmt(cell) for cell in row]
            # A non-positive multiplier yields [], so long rows are untouched.
            cells.extend([""] * (width - len(cells)))
            rows.append(cells)

        return "\n".join("| " + " | ".join(cells) + " |" for cells in rows)

    def _extract_images_from_page(self, page: "fitz.Page", page_num: int) -> List[str]:
        """Collect identifier strings for the images referenced on a page.

        No image data is saved; each entry only records the page number, the
        image's index on the page and the first field of the image record
        (named ``xref`` here).

        Returns:
            The identifier strings, or the entries gathered so far if
            reading the image list fails (a warning is logged).
        """
        images = []
        try:
            for img_index, img in enumerate(page.get_images()):
                xref = img[0]
                images.append(f"image_p{page_num}_i{img_index}_xref{xref}")
        except Exception as e:
            logger.warning(f"图片提取失败: {e}")
        return images

    def _generate_markdown(self, doc_content: "PDFDocumentContent") -> str:
        """Render the parsed document as Markdown text.

        The output has a title (metadata title, falling back to the file
        path), a document-information section listing selected non-empty
        metadata entries, and a body section with, for every page, a page
        marker, the processed page text and the page's tables.
        """
        lines = []

        # Document title.
        title = doc_content.metadata.get("title", "")
        lines.append(f"# {title}\n" if title else f"# {doc_content.file_path}\n")

        # Metadata section: only these keys are listed, and only when set.
        shown_keys = ("author", "subject", "keywords", "creation_date")
        lines.append("\n## 文档信息\n")
        for key, value in doc_content.metadata.items():
            if value and key in shown_keys:
                lines.append(f"- **{key}**: {value}")

        # Body section.
        lines.append("\n## 正文\n")
        for page in doc_content.pages:
            lines.append(f"\n---\n**第 {page.page_number} 页**\n")
            lines.append(self._process_page_text(page.text, page.blocks))
            for table in page.tables:
                lines.append("\n" + table + "\n")

        return "\n".join(lines)

    def _process_page_text(self, text: str, blocks: List[Dict]) -> str:
        """Process page text by marking up its heading structure."""
        return self._detect_headers(text, blocks)

    def _detect_headers(self, text: str, blocks: List[Dict]) -> str:
        """Mark headings and list items in page text using content patterns.

        Lines are stripped and blank lines are dropped. A line starting with
        "第<numeral>章", "第<numeral>节" or "第<numeral>条" (followed by
        whitespace or the end of the line) becomes a level 2, 3 or 4
        Markdown heading respectively. A line starting with a Chinese
        numeral followed by "、" or "." becomes a list item. Every other
        line is kept as is.

        Args:
            text: Plain text of the page.
            blocks: Accepted for interface compatibility; not used by the
                current pattern-based detection.

        Returns:
            The processed lines joined with newlines.
        """
        processed_lines = []

        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            for pattern, marker in self._HEADING_RULES:
                if pattern.match(line):
                    processed_lines.append(f"\n{marker} {line}\n")
                    break
            else:
                if self._LIST_ITEM_RE.match(line):
                    processed_lines.append(f"- {line}")
                else:
                    processed_lines.append(line)

        return "\n".join(processed_lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` with default options and return its Markdown text."""
        return self.parse(file_path).markdown_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_pdf(file_path: str, **kwargs) -> PDFDocumentContent:
    """Convenience function: parse a PDF with a fresh ``PDFParser``.

    Keyword arguments are forwarded unchanged to ``PDFParser.parse``.
    """
    return PDFParser().parse(file_path, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_pdf_to_markdown(file_path: str) -> str:
    """Convenience function: parse a PDF and return its Markdown text."""
    return PDFParser().parse_to_markdown(file_path)
|