update
This commit is contained in:
7
backend/app/services/parser/__init__.py
Normal file
7
backend/app/services/parser/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
# backend/app/services/parser/__init__.py
|
||||
"""文档解析服务"""
|
||||
|
||||
from .pdf_parser import PDFParser
|
||||
from .docx_parser import DocxParser
|
||||
|
||||
__all__ = ["PDFParser", "DocxParser"]
|
||||
287
backend/app/services/parser/docx_parser.py
Normal file
287
backend/app/services/parser/docx_parser.py
Normal file
@@ -0,0 +1,287 @@
|
||||
# backend/app/services/parser/docx_parser.py
|
||||
"""Word文档解析 - 使用python-docx"""
|
||||
|
||||
from docx import Document
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class DocxParagraph:
    """One paragraph taken from a Word document.

    Attributes:
        text: The paragraph text.
        level: Heading depth; ``0`` marks ordinary body text.
        is_list: ``True`` when the paragraph was recognised as a list item.
        list_number: The list marker that was recognised (for example
            ``"1."``), or ``None`` when there is none.
    """

    text: str
    # Heading depth; 0 means body text.
    level: int = 0
    is_list: bool = False
    list_number: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class DocxTable:
    """One table taken from a Word document.

    Attributes:
        rows: Cell texts, one inner list per table row.
        markdown: Markdown rendering of the table; empty by default.
    """

    rows: List[List[str]]
    markdown: str = ""
|
||||
|
||||
|
||||
@dataclass
class DocxDocumentContent:
    """Everything extracted from a single Word document.

    Attributes:
        file_path: Path of the source document.
        paragraphs: Extracted paragraphs.
        tables: Extracted tables.
        metadata: Document properties as string key/value pairs; a fresh
            empty dict per instance by default.
        markdown_text: Markdown rendering of the document; empty by default.
    """

    file_path: str
    paragraphs: List[DocxParagraph]
    tables: List[DocxTable]
    # default_factory gives each instance its own dict.
    metadata: Dict[str, str] = field(default_factory=dict)
    markdown_text: str = ""
|
||||
|
||||
|
||||
class DocxParser:
    """Parse Word (.docx) documents with python-docx and render them as Markdown.

    The document opened by the most recent ``parse`` call is kept on
    ``self.document``; the private ``_extract_*`` helpers read from it.
    """

    # Markdown defines heading levels 1-6 only, so deeper levels are clamped.
    _MAX_MARKDOWN_HEADING = 6

    # Pulls the digit out of style names such as "Heading 2" or "标题 2".
    _STYLE_LEVEL_RE = re.compile(r'Heading\s*(\d)|标题\s*(\d)')

    # Content-based heading markers (chapter / section / article) paired with
    # the heading level each one produces.
    _CONTENT_LEVEL_RES = (
        (re.compile(r'^第[一二三四五六七八九十百]+章\s'), 2),
        (re.compile(r'^第[一二三四五六七八九十百]+节\s'), 3),
        (re.compile(r'^第[一二三四五六七八九十百]+条\s'), 4),
    )

    # Arabic-numeral list markers. Trailing whitespace is optional because
    # Chinese text normally follows the marker directly ("1、内容").
    _NUMERIC_LIST_RE = re.compile(
        r'^('
        r'[(（\[]\d+[)）\]]'   # (1)  （1）  [1]
        r'|\d+[、)）\]]'       # 1、  1)  1]
        r'|\d+[.．](?!\d)'     # 1.  -- but not a decimal such as 3.14
        r')'
    )

    # Chinese-numeral list markers.
    _CHINESE_LIST_RE = re.compile(
        r'^('
        r'[(（][一二三四五六七八九十]+[)）]'   # (一)  （一）
        r'|[一二三四五六七八九十]+[、.．)）]'  # 一、  一.  一)
        r')'
    )

    def __init__(self):
        self.document = None

    def parse(self, file_path: str) -> DocxDocumentContent:
        """Parse a Word document.

        Args:
            file_path: Path of the Word document.

        Returns:
            DocxDocumentContent: metadata, paragraphs, tables and the
            generated Markdown text.

        Raises:
            Exception: any error raised while opening or reading the document
                is logged and re-raised unchanged.
        """
        logger.info(f"开始解析Word文档: {file_path}")

        try:
            self.document = Document(file_path)
            doc_content = DocxDocumentContent(
                file_path=file_path,
                paragraphs=[],
                tables=[],
            )

            doc_content.metadata = self._extract_metadata()
            doc_content.paragraphs = self._extract_paragraphs()
            doc_content.tables = self._extract_tables()
            # Markdown is rendered last because it needs all three parts above.
            doc_content.markdown_text = self._generate_markdown(doc_content)

            logger.success(f"Word文档解析完成，共{len(doc_content.paragraphs)}个段落")
            return doc_content

        except Exception as e:
            logger.error(f"Word文档解析失败: {e}")
            raise

    def _extract_metadata(self) -> Dict[str, str]:
        """Read the core properties of ``self.document``.

        Best effort: on any failure a warning is logged and an empty dict is
        returned.
        """
        metadata = {}
        try:
            core_props = self.document.core_properties
            metadata = {
                "title": core_props.title or "",
                "author": core_props.author or "",
                "subject": core_props.subject or "",
                "keywords": core_props.keywords or "",
                "created": str(core_props.created) if core_props.created else "",
                "modified": str(core_props.modified) if core_props.modified else "",
            }
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
        return metadata

    def _extract_paragraphs(self) -> List[DocxParagraph]:
        """Return every non-blank paragraph of ``self.document``."""
        paragraphs = []

        for para in self.document.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            is_list, list_number = self._detect_list_item(para)
            paragraphs.append(
                DocxParagraph(
                    text=text,
                    level=self._get_paragraph_level(para),
                    is_list=is_list,
                    list_number=list_number,
                )
            )

        return paragraphs

    def _get_paragraph_level(self, para) -> int:
        """Determine the heading level of a paragraph.

        The style name is checked first ("Heading N" / "标题 N"); otherwise
        the text is matched against the chapter / section / article patterns.

        Returns:
            int: heading level, ``0`` for body text.
        """
        # A style may be missing or unnamed; treat both as "no style name".
        style_name = (para.style.name if para.style else "") or ""

        if "Heading" in style_name or "标题" in style_name:
            match = self._STYLE_LEVEL_RE.search(style_name)
            if match:
                return int(match.group(1) or match.group(2))

        text = para.text.strip()
        for pattern, level in self._CONTENT_LEVEL_RES:
            if pattern.match(text):
                return level

        return 0

    def _detect_list_item(self, para) -> tuple[bool, Optional[str]]:
        """Detect whether a paragraph starts with a list marker.

        Recognises Arabic markers (``1.``, ``1、``, ``1)``, ``(1)``, ``[1]``)
        and Chinese-numeral markers (``一、``, ``一.``, ``(一)``), with or
        without whitespace after the marker.

        Returns:
            tuple: ``(True, marker)`` when a marker is found, otherwise
            ``(False, None)``.
        """
        text = para.text.strip()

        for pattern in (self._NUMERIC_LIST_RE, self._CHINESE_LIST_RE):
            match = pattern.match(text)
            if match:
                return True, match.group(1)

        return False, None

    def _extract_tables(self) -> List[DocxTable]:
        """Return every table of ``self.document`` with its Markdown rendering."""
        tables = []

        for table in self.document.tables:
            rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            tables.append(DocxTable(rows=rows, markdown=self._table_to_markdown(rows)))

        return tables

    @staticmethod
    def _markdown_cell(value) -> str:
        """Make one cell value safe to place inside a Markdown table row."""
        text = "" if value is None else str(value)
        # A raw newline would end the row and a raw pipe would start a new
        # column, so replace the former and escape the latter.
        return "<br>".join(text.splitlines()).replace("|", "\\|")

    def _format_row(self, row) -> str:
        """Render one table row as a Markdown table line."""
        return "| " + " | ".join(self._markdown_cell(cell) for cell in row) + " |"

    def _table_to_markdown(self, rows: List[List[str]]) -> str:
        """Convert table rows to a Markdown table.

        The first row becomes the header. Returns an empty string when there
        are no rows.
        """
        if not rows:
            return ""

        header, *body = rows
        lines = [
            self._format_row(header),
            "| " + " | ".join("---" for _ in header) + " |",
        ]
        lines.extend(self._format_row(row) for row in body)

        return "\n".join(lines)

    def _generate_markdown(self, doc_content: DocxDocumentContent) -> str:
        """Render the extracted content as one Markdown string.

        Layout: title, a metadata section, the body paragraphs, then a
        section listing all tables.
        """
        lines = []

        # Title: metadata title, else the first level-1 heading among the
        # first five paragraphs, else the file path.
        title = doc_content.metadata.get("title", "")
        if not title:
            title = next(
                (p.text for p in doc_content.paragraphs[:5] if p.level == 1),
                doc_content.file_path,
            )
        lines.append(f"# {title}\n")

        lines.append("\n## 文档信息\n")
        for key, value in doc_content.metadata.items():
            if value:
                lines.append(f"- **{key}**: {value}")

        lines.append("\n## 正文\n")
        for para in doc_content.paragraphs:
            if para.level > 0:
                prefix = "#" * min(para.level, self._MAX_MARKDOWN_HEADING)
                lines.append(f"\n{prefix} {para.text}\n")
            elif para.is_list:
                lines.append(f"- {para.text}")
            else:
                lines.append(para.text)

        if doc_content.tables:
            lines.append("\n## 表格\n")
            for number, table in enumerate(doc_content.tables, start=1):
                lines.append(f"\n### 表格 {number}\n")
                lines.append(table.markdown + "\n")

        return "\n".join(lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse the document and return only its Markdown text."""
        doc_content = self.parse(file_path)
        return doc_content.markdown_text
|
||||
|
||||
|
||||
def parse_docx(file_path: str) -> DocxDocumentContent:
    """Convenience function: parse a Word document with a fresh DocxParser."""
    return DocxParser().parse(file_path)
|
||||
|
||||
|
||||
def parse_docx_to_markdown(file_path: str) -> str:
    """Convenience function: parse a Word document and return its Markdown."""
    return DocxParser().parse_to_markdown(file_path)
|
||||
204
backend/app/services/parser/mineru_parser.py
Normal file
204
backend/app/services/parser/mineru_parser.py
Normal file
@@ -0,0 +1,204 @@
|
||||
# backend/app/services/parser/mineru_parser.py
|
||||
"""MinerU多模态PDF解析 - 版面感知解析"""
|
||||
|
||||
from typing import Optional, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
import os
|
||||
|
||||
|
||||
@dataclass
class MinerUResult:
    """Outcome of one MinerU parse attempt.

    Attributes:
        file_path: Path of the parsed file.
        markdown_text: Markdown produced by the parse.
        metadata: String key/value metadata; a fresh empty dict per instance
            by default.
        success: Whether the parse succeeded; ``True`` by default.
        error_message: Failure description; empty by default.
    """

    file_path: str
    markdown_text: str
    # default_factory gives each instance its own dict.
    metadata: Dict[str, str] = field(default_factory=dict)
    success: bool = True
    error_message: str = ""
|
||||
|
||||
|
||||
class MinerUParser:
    """PDF parser backed by MinerU (magic-pdf).

    MinerU is an open-source PDF parsing tool that outputs Markdown.
    ``self.available`` records whether the package could be imported when the
    parser was created; ``parse`` never raises and reports failures through
    the returned ``MinerUResult``.

    GitHub: https://github.com/opendatalab/MinerU
    """

    def __init__(self):
        self.available = self._check_mineru_available()

    def _check_mineru_available(self) -> bool:
        """Return True when the magic-pdf pipeline can be imported."""
        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe  # noqa: F401 - import probe only
        except ImportError:
            logger.warning("MinerU (magic-pdf) 未安装，请运行: pip install magic-pdf[full]")
            return False
        return True

    @staticmethod
    def _resolve_output_dir(file_path: str, output_dir: Optional[str]) -> str:
        """Pick the directory for parse artefacts.

        An explicit ``output_dir`` wins. Otherwise the input file's directory
        is used, falling back to ``"."`` because ``os.path.dirname`` returns
        an empty string for a bare file name.
        """
        if output_dir is not None:
            return output_dir
        return os.path.dirname(file_path) or "."

    def parse(self, file_path: str, output_dir: Optional[str] = None) -> MinerUResult:
        """Parse a PDF document with MinerU.

        Args:
            file_path: Path of the PDF file.
            output_dir: Optional directory for parse artefacts; defaults to
                the directory containing ``file_path``.

        Returns:
            MinerUResult: ``success`` is False (with ``error_message`` set)
            when MinerU is not installed, raises, or produces no Markdown.
        """
        logger.info(f"尝试使用MinerU解析: {file_path}")

        if not self.available:
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message="MinerU未安装",
            )

        try:
            from magic_pdf.pipe.UNIPipe import UNIPipe

            output_dir = self._resolve_output_dir(file_path, output_dir)

            # NOTE(review): the constructor arguments and the pipe_mk() call
            # are kept exactly as written; confirm them against the installed
            # magic-pdf release, since its pipeline API differs by version.
            pipe = UNIPipe(file_path, output_dir)
            markdown_content = pipe.pipe_mk()

            # Report empty or non-text output as a failure so callers can
            # fall back to another parser instead of returning nothing.
            if not isinstance(markdown_content, str) or not markdown_content.strip():
                logger.warning("MinerU未返回Markdown内容")
                return MinerUResult(
                    file_path=file_path,
                    markdown_text="",
                    success=False,
                    error_message="MinerU未返回Markdown内容",
                )

            logger.success("MinerU解析成功")

            return MinerUResult(
                file_path=file_path,
                markdown_text=markdown_content,
                metadata=self._extract_metadata(pipe),
                success=True,
            )

        except Exception as e:
            logger.error(f"MinerU解析失败: {e}")
            return MinerUResult(
                file_path=file_path,
                markdown_text="",
                success=False,
                error_message=str(e),
            )

    def _extract_metadata(self, pipe) -> Dict[str, str]:
        """Collect metadata from the pipeline's ``pdf_mid_data``, if present.

        Best effort: returns an empty dict when the attribute is missing or
        empty, and logs a warning if reading it fails.
        """
        metadata = {}
        try:
            mid_data = getattr(pipe, 'pdf_mid_data', None)
            if mid_data:
                metadata = {
                    "page_count": str(mid_data.get("page_count", "")),
                    "language": str(mid_data.get("language", "")),
                    "is_scanned": str(mid_data.get("is_scanned", "")),
                }
        except Exception as e:
            logger.warning(f"提取MinerU元数据失败: {e}")

        return metadata

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse the file and return its Markdown, or "" on failure."""
        result = self.parse(file_path)
        return result.markdown_text if result.success else ""
|
||||
|
||||
|
||||
class ParserOrchestrator:
    """Choose a parser by file type and priority.

    PDF strategy:
        1. Try MinerU first when it is available and preferred.
        2. Fall back to the basic PyMuPDF parser when MinerU fails.
    """

    def __init__(self):
        from .pdf_parser import PDFParser

        self.mineru_parser = MinerUParser()
        self.pdf_parser = PDFParser()
        self.mineru_available = self.mineru_parser.available

    def parse_pdf(self, file_path: str, prefer_mineru: bool = True) -> str:
        """Parse a PDF document, picking the parser by priority.

        Args:
            file_path: Path of the PDF file.
            prefer_mineru: Whether to try MinerU before PyMuPDF.

        Returns:
            str: Markdown text.
        """
        if prefer_mineru and self.mineru_available:
            result = self.mineru_parser.parse(file_path)
            if result.success:
                logger.info("使用MinerU解析成功")
                return result.markdown_text
            logger.warning(f"MinerU解析失败，回退到PyMuPDF: {result.error_message}")

        logger.info("使用PyMuPDF基础解析")
        return self.pdf_parser.parse_to_markdown(file_path)

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word document and return its Markdown text."""
        from .docx_parser import DocxParser

        return DocxParser().parse_to_markdown(file_path)

    @staticmethod
    def _is_legacy_doc(file_path: str) -> bool:
        """Return True for an existing file that is not a zip container.

        .docx files are zip packages, while the legacy binary .doc format is
        not, so this separates a real .doc from a mis-named .docx.
        """
        import zipfile

        return os.path.isfile(file_path) and not zipfile.is_zipfile(file_path)

    def parse(self, file_path: str) -> str:
        """Pick a parser from the file extension.

        Args:
            file_path: Path of the file.

        Returns:
            str: Markdown text.

        Raises:
            ValueError: for an unsupported extension, or for a ``.doc`` file
                in the legacy binary format that python-docx cannot read.
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == ".pdf":
            return self.parse_pdf(file_path)

        if ext == ".docx":
            return self.parse_docx(file_path)

        if ext == ".doc":
            # python-docx only reads the zip-based .docx format. Reject real
            # legacy files with a clear message, but still try a .doc that is
            # actually a zip package (a mis-named .docx).
            if self._is_legacy_doc(file_path):
                raise ValueError(f"不支持的文件类型: {ext}（旧版Word二进制格式，请先转换为.docx）")
            return self.parse_docx(file_path)

        raise ValueError(f"不支持的文件类型: {ext}")
|
||||
|
||||
|
||||
def parse_with_mineru(file_path: str) -> MinerUResult:
    """Convenience function: parse a file with a fresh MinerUParser."""
    return MinerUParser().parse(file_path)
|
||||
|
||||
|
||||
def parse_pdf_smart(file_path: str) -> str:
    """Convenience function: parse a PDF through a fresh ParserOrchestrator."""
    return ParserOrchestrator().parse_pdf(file_path)
|
||||
268
backend/app/services/parser/pdf_parser.py
Normal file
268
backend/app/services/parser/pdf_parser.py
Normal file
@@ -0,0 +1,268 @@
|
||||
# backend/app/services/parser/pdf_parser.py
|
||||
"""PDF文档解析 - 使用PyMuPDF基础解析"""
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class PDFPageContent:
    """Content extracted from one PDF page.

    Attributes:
        page_number: Number of the page.
        text: Plain text of the page.
        tables: Tables found on the page, as strings.
        images: Image entries for the page (described in the original
            comment as a list of image paths).
        blocks: Text-block dictionaries for the page.

    All list fields default to a fresh empty list per instance.
    """

    page_number: int
    text: str
    tables: List[str] = field(default_factory=list)
    # Original note: list of image paths.
    images: List[str] = field(default_factory=list)
    blocks: List[Dict] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class PDFDocumentContent:
    """Everything extracted from a single PDF document.

    Attributes:
        file_path: Path of the source PDF.
        total_pages: Number of pages in the document.
        pages: Per-page content.
        metadata: Document metadata as string key/value pairs; a fresh empty
            dict per instance by default.
        markdown_text: Markdown rendering of the document; empty by default.
    """

    file_path: str
    total_pages: int
    pages: List[PDFPageContent]
    # default_factory gives each instance its own dict.
    metadata: Dict[str, str] = field(default_factory=dict)
    markdown_text: str = ""
|
||||
|
||||
|
||||
class PDFParser:
    """Parse PDF documents with PyMuPDF and render them as Markdown.

    The document opened by the most recent ``parse`` call is kept on
    ``self.pdf``; it is closed before ``parse`` returns or raises.
    """

    # (output key, PyMuPDF metadata key) pairs copied by _extract_metadata.
    _METADATA_FIELDS = (
        ("title", "title"),
        ("author", "author"),
        ("subject", "subject"),
        ("keywords", "keywords"),
        ("creator", "creator"),
        ("producer", "producer"),
        ("creation_date", "creationDate"),
        ("mod_date", "modDate"),
    )

    # Metadata keys shown in the Markdown "document info" section.
    _DISPLAYED_METADATA = ("author", "subject", "keywords", "creation_date")

    # Line patterns paired with the Markdown template applied on a match:
    # chapter, section, article, then Chinese-numeral sub-items.
    _LINE_RULES = (
        (re.compile(r'^第[一二三四五六七八九十百]+章\s'), "\n## {}\n"),
        (re.compile(r'^第[一二三四五六七八九十百]+节\s'), "\n### {}\n"),
        (re.compile(r'^第[一二三四五六七八九十百]+条\s'), "\n#### {}\n"),
        (re.compile(r'^[一二三四五六七八九十]+\s*[、.]'), "- {}"),
    )

    def __init__(self):
        self.pdf = None

    def parse(self, file_path: str, extract_tables: bool = True, extract_images: bool = False) -> PDFDocumentContent:
        """Parse a PDF document.

        Args:
            file_path: Path of the PDF file.
            extract_tables: Whether to extract tables.
            extract_images: Whether to record the images on each page.

        Returns:
            PDFDocumentContent: metadata, per-page content and the generated
            Markdown text.

        Raises:
            Exception: any error raised while opening or reading the document
                is logged and re-raised unchanged.
        """
        logger.info(f"开始解析PDF文档: {file_path}")

        try:
            self.pdf = fitz.open(file_path)
            try:
                doc_content = PDFDocumentContent(
                    file_path=file_path,
                    total_pages=self.pdf.page_count,
                    pages=[],
                )

                doc_content.metadata = self._extract_metadata()

                for page_num in range(self.pdf.page_count):
                    page = self.pdf[page_num]
                    # Pages are reported 1-based.
                    doc_content.pages.append(
                        self._parse_page(page, page_num + 1, extract_tables, extract_images)
                    )

                doc_content.markdown_text = self._generate_markdown(doc_content)
            finally:
                # Close the document even when a page fails to parse, so the
                # file handle is not leaked.
                self.pdf.close()

            logger.success(f"PDF解析完成，共{doc_content.total_pages}页")
            return doc_content

        except Exception as e:
            logger.error(f"PDF解析失败: {e}")
            raise

    def _extract_metadata(self) -> Dict[str, str]:
        """Read the metadata of ``self.pdf``.

        Missing or ``None`` values become empty strings. Best effort: on any
        failure a warning is logged and an empty dict is returned.
        """
        metadata = {}
        try:
            meta = self.pdf.metadata
            metadata = {
                out_key: meta.get(src_key) or ""
                for out_key, src_key in self._METADATA_FIELDS
            }
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
        return metadata

    def _parse_page(self, page: fitz.Page, page_num: int,
                    extract_tables: bool, extract_images: bool) -> PDFPageContent:
        """Extract the blocks, text and optionally tables/images of one page."""
        page_content = PDFPageContent(page_number=page_num, text="")

        # Structured block dictionaries for the page.
        page_content.blocks = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)["blocks"]

        # Plain text of the page.
        text = page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE)
        page_content.text = text.strip()

        if extract_tables:
            page_content.tables = self._extract_tables_from_page(page)

        if extract_images:
            page_content.images = self._extract_images_from_page(page, page_num)

        return page_content

    def _extract_tables_from_page(self, page: fitz.Page) -> List[str]:
        """Extract the tables of a page as Markdown strings.

        Uses ``Page.find_tables`` (added in PyMuPDF 1.23.0). Best effort: a
        warning is logged and the tables found so far are returned when the
        method is missing or extraction fails.
        """
        tables = []

        # Check for the method up front so an unrelated AttributeError is not
        # misreported as an outdated PyMuPDF version.
        find_tables = getattr(page, "find_tables", None)
        if find_tables is None:
            logger.warning("PyMuPDF版本不支持表格提取，请升级到1.23.0+版本")
            return tables

        try:
            for tab in find_tables().tables:
                tables.append(self._table_to_markdown(tab.extract()))
        except Exception as e:
            logger.warning(f"表格提取失败: {e}")

        return tables

    @staticmethod
    def _markdown_cell(cell) -> str:
        """Make one cell value safe to place inside a Markdown table row."""
        # Empty cells can arrive as None; render them blank, not as "None".
        if cell is None:
            return ""
        text = str(cell).strip()
        # A raw newline would end the row and a raw pipe would start a new
        # column, so replace the former and escape the latter.
        return "<br>".join(text.splitlines()).replace("|", "\\|")

    def _format_row(self, row) -> str:
        """Render one table row as a Markdown table line."""
        return "| " + " | ".join(self._markdown_cell(cell) for cell in row) + " |"

    def _table_to_markdown(self, table_data: List[List[str]]) -> str:
        """Convert table rows to a Markdown table.

        The first row becomes the header. Returns an empty string when there
        are no rows.
        """
        if not table_data:
            return ""

        header, *body = table_data
        lines = [
            self._format_row(header),
            "| " + " | ".join("---" for _ in header) + " |",
        ]
        lines.extend(self._format_row(row) for row in body)

        return "\n".join(lines)

    def _extract_images_from_page(self, page: fitz.Page, page_num: int) -> List[str]:
        """Return one identifier per image on the page.

        Only identifiers are recorded; no image data is saved. Best effort:
        a warning is logged if listing the images fails.
        """
        images = []
        try:
            for img_index, img in enumerate(page.get_images()):
                xref = img[0]
                images.append(f"image_p{page_num}_i{img_index}_xref{xref}")
        except Exception as e:
            logger.warning(f"图片提取失败: {e}")
        return images

    def _generate_markdown(self, doc_content: PDFDocumentContent) -> str:
        """Render the extracted content as one Markdown string.

        Layout: title, a metadata section, then each page's text followed by
        that page's tables.
        """
        lines = []

        # Title: metadata title, else the file path.
        title = doc_content.metadata.get("title", "")
        lines.append(f"# {title}\n" if title else f"# {doc_content.file_path}\n")

        lines.append("\n## 文档信息\n")
        for key, value in doc_content.metadata.items():
            if value and key in self._DISPLAYED_METADATA:
                lines.append(f"- **{key}**: {value}")

        lines.append("\n## 正文\n")
        for page in doc_content.pages:
            lines.append(f"\n---\n**第 {page.page_number} 页**\n")
            lines.append(self._process_page_text(page.text, page.blocks))
            for table in page.tables:
                lines.append("\n" + table + "\n")

        return "\n".join(lines)

    def _process_page_text(self, text: str, blocks: List[Dict]) -> str:
        """Post-process page text; currently this only marks up headings."""
        return self._detect_headers(text, blocks)

    def _detect_headers(self, text: str, blocks: List[Dict]) -> str:
        """Mark headings and sub-items by content pattern.

        Blank lines are dropped. Chapter / section / article lines become
        level 2 / 3 / 4 headings and Chinese-numeral items become bullets.
        ``blocks`` is accepted but not used.
        """
        processed_lines = []

        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            for pattern, template in self._LINE_RULES:
                if pattern.match(line):
                    processed_lines.append(template.format(line))
                    break
            else:
                processed_lines.append(line)

        return "\n".join(processed_lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse the document and return only its Markdown text."""
        doc_content = self.parse(file_path)
        return doc_content.markdown_text
|
||||
|
||||
|
||||
def parse_pdf(file_path: str, **kwargs) -> PDFDocumentContent:
    """Convenience function: parse a PDF with a fresh PDFParser.

    Extra keyword arguments are forwarded to ``PDFParser.parse``.
    """
    return PDFParser().parse(file_path, **kwargs)
|
||||
|
||||
|
||||
def parse_pdf_to_markdown(file_path: str) -> str:
    """Convenience function: parse a PDF and return its Markdown text."""
    return PDFParser().parse_to_markdown(file_path)
|
||||
Reference in New Issue
Block a user