Files
AIRegulation-DocAnalysis/backend/app/services/parser/pdf_parser.py

268 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""PDF文档解析 - 使用PyMuPDF基础解析"""
import fitz # PyMuPDF
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, field
from loguru import logger
import re
@dataclass
class PDFPageContent:
"""PDF页面内容"""
page_number: int
text: str
tables: List[str] = field(default_factory=list)
images: List[str] = field(default_factory=list) # 图片路径列表
blocks: List[Dict] = field(default_factory=list)
@dataclass
class PDFDocumentContent:
"""PDF文档完整内容"""
file_path: str
total_pages: int
pages: List[PDFPageContent]
metadata: Dict[str, str] = field(default_factory=dict)
markdown_text: str = ""
class PDFParser:
"""PDF文档解析器 - 基于PyMuPDF"""
def __init__(self):
self.pdf = None
def parse(self, file_path: str, extract_tables: bool = True, extract_images: bool = False) -> PDFDocumentContent:
"""
解析PDF文档
Args:
file_path: PDF文件路径
extract_tables: 是否提取表格
extract_images: 是否提取图片
Returns:
PDFDocumentContent: 解析后的文档内容
"""
logger.info(f"开始解析PDF文档: {file_path}")
try:
self.pdf = fitz.open(file_path)
doc_content = PDFDocumentContent(
file_path=file_path,
total_pages=self.pdf.page_count,
pages=[]
)
# 提取文档元数据
doc_content.metadata = self._extract_metadata()
# 逐页解析
for page_num in range(self.pdf.page_count):
page = self.pdf[page_num]
page_content = self._parse_page(page, page_num + 1, extract_tables, extract_images)
doc_content.pages.append(page_content)
# 生成Markdown格式文本
doc_content.markdown_text = self._generate_markdown(doc_content)
self.pdf.close()
logger.success(f"PDF解析完成{doc_content.total_pages}")
return doc_content
except Exception as e:
logger.error(f"PDF解析失败: {e}")
raise
def _extract_metadata(self) -> Dict[str, str]:
"""提取PDF元数据"""
metadata = {}
try:
meta = self.pdf.metadata
metadata = {
"title": meta.get("title", ""),
"author": meta.get("author", ""),
"subject": meta.get("subject", ""),
"keywords": meta.get("keywords", ""),
"creator": meta.get("creator", ""),
"producer": meta.get("producer", ""),
"creation_date": meta.get("creationDate", ""),
"mod_date": meta.get("modDate", ""),
}
except Exception as e:
logger.warning(f"提取元数据失败: {e}")
return metadata
def _parse_page(self, page: fitz.Page, page_num: int,
extract_tables: bool, extract_images: bool) -> PDFPageContent:
"""解析单页内容"""
page_content = PDFPageContent(page_number=page_num, text="")
# 提取文本块(保留结构)
blocks = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)["blocks"]
page_content.blocks = blocks
# 提取纯文本
text = page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
page_content.text = text.strip()
# 提取表格使用PyMuPDF的表格提取功能
if extract_tables:
tables = self._extract_tables_from_page(page)
page_content.tables = tables
# 提取图片
if extract_images:
images = self._extract_images_from_page(page, page_num)
page_content.images = images
return page_content
def _extract_tables_from_page(self, page: fitz.Page) -> List[str]:
"""
从页面提取表格(基于文本块分析)
注意PyMuPDF基础版表格提取能力有限复杂表格建议使用MinerU
"""
tables = []
try:
# 使用PyMuPDF的表格提取方法2.4+版本)
# 对于更复杂的表格需要在mineru_parser中使用更高级的方法
tabs = page.find_tables()
if tabs:
for tab in tabs:
table_text = tab.extract()
# 将表格转换为Markdown格式
markdown_table = self._table_to_markdown(table_text)
tables.append(markdown_table)
except AttributeError:
# 旧版本PyMuPDF没有表格提取功能
logger.warning("PyMuPDF版本不支持表格提取请升级到2.4+版本")
except Exception as e:
logger.warning(f"表格提取失败: {e}")
return tables
def _table_to_markdown(self, table_data: List[List[str]]) -> str:
"""将表格数据转换为Markdown格式"""
if not table_data or len(table_data) < 1:
return ""
lines = []
# 表头
if len(table_data) >= 1:
header = table_data[0]
lines.append("| " + " | ".join(str(cell).strip() for cell in header) + " |")
lines.append("| " + " | ".join("---" for _ in header) + " |")
# 数据行
for row in table_data[1:]:
lines.append("| " + " | ".join(str(cell).strip() for cell in row) + " |")
return "\n".join(lines)
def _extract_images_from_page(self, page: fitz.Page, page_num: int) -> List[str]:
"""提取页面图片"""
images = []
# 图片提取功能(可选实现)
# 这里仅记录图片信息,实际图片需要额外保存
try:
image_list = page.get_images()
for img_index, img in enumerate(image_list):
xref = img[0]
images.append(f"image_p{page_num}_i{img_index}_xref{xref}")
except Exception as e:
logger.warning(f"图片提取失败: {e}")
return images
def _generate_markdown(self, doc_content: PDFDocumentContent) -> str:
"""生成Markdown格式文本"""
lines = []
# 文档标题
title = doc_content.metadata.get("title", "")
if title:
lines.append(f"# {title}\n")
else:
lines.append(f"# {doc_content.file_path}\n")
# 元数据信息
lines.append("\n## 文档信息\n")
for key, value in doc_content.metadata.items():
if value and key in ["author", "subject", "keywords", "creation_date"]:
lines.append(f"- **{key}**: {value}")
# 正文内容
lines.append("\n## 正文\n")
for page in doc_content.pages:
# 页码标记
lines.append(f"\n---\n**第 {page.page_number} 页**\n")
# 处理文本内容,识别标题结构
text = self._process_page_text(page.text, page.blocks)
lines.append(text)
# 添加表格
for table in page.tables:
lines.append("\n" + table + "\n")
return "\n".join(lines)
def _process_page_text(self, text: str, blocks: List[Dict]) -> str:
"""处理页面文本,识别标题结构"""
# 基于字体大小识别标题
processed_text = text
# 尝试识别标题(基于字号)
# 法规文档通常有明确的层级结构:章、节、条
processed_text = self._detect_headers(text, blocks)
return processed_text
def _detect_headers(self, text: str, blocks: List[Dict]) -> str:
"""检测并标记标题(基于字号或内容模式)"""
lines = text.split("\n")
processed_lines = []
for line in lines:
line = line.strip()
if not line:
continue
# 法规标题模式检测
# 第一章、第X章、第X节、第X条等
if re.match(r'^第[一二三四五六七八九十百]+章\s', line):
processed_lines.append(f"\n## {line}\n")
elif re.match(r'^第[一二三四五六七八九十百]+节\s', line):
processed_lines.append(f"\n### {line}\n")
elif re.match(r'^第[一二三四五六七八九十百]+条\s', line):
processed_lines.append(f"\n#### {line}\n")
elif re.match(r'^[一二三四五六七八九十]+\s*[、.]', line):
# 条款子项
processed_lines.append(f"- {line}")
else:
processed_lines.append(line)
return "\n".join(processed_lines)
def parse_to_markdown(self, file_path: str) -> str:
"""直接解析并返回Markdown文本"""
doc_content = self.parse(file_path)
return doc_content.markdown_text
def parse_pdf(file_path: str, **kwargs) -> PDFDocumentContent:
"""便捷函数解析PDF文档"""
parser = PDFParser()
return parser.parse(file_path, **kwargs)
def parse_pdf_to_markdown(file_path: str) -> str:
"""便捷函数解析PDF并返回Markdown"""
parser = PDFParser()
return parser.parse_to_markdown(file_path)