Files
AIRegulation-DocAnalysis/backend/app/services/parser/docx_parser.py

287 lines
8.7 KiB
Python
Raw Normal View History

2026-05-14 15:07:34 +08:00
"""Word文档解析 - 使用python-docx"""
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from typing import List, Dict, Optional
from dataclasses import dataclass, field
from loguru import logger
import re
@dataclass
class DocxParagraph:
    """A single non-empty paragraph extracted from a Word document."""

    # Paragraph text with surrounding whitespace removed.
    text: str
    # Heading level; 0 means ordinary body text.
    level: int = 0
    # True when the paragraph starts with a recognised list marker.
    is_list: bool = False
    # The list marker that was recognised (e.g. "1." or "一、"), if any.
    list_number: Optional[str] = None
@dataclass
class DocxTable:
    """A table extracted from a Word document."""

    # Cell texts, one inner list per table row.
    rows: List[List[str]]
    # The same table rendered as Markdown; empty until it is generated.
    markdown: str = ""
@dataclass
class DocxDocumentContent:
    """Everything extracted from one Word document.

    Only ``file_path`` is required; every collection defaults to a fresh
    empty container so instances never share mutable state.
    """

    # Path the document was loaded from.
    file_path: str
    # Non-empty paragraphs in document order.
    paragraphs: List["DocxParagraph"] = field(default_factory=list)
    # Tables in document order.
    tables: List["DocxTable"] = field(default_factory=list)
    # Core document properties (title, author, ...), all as strings.
    metadata: Dict[str, str] = field(default_factory=dict)
    # Markdown rendering of the whole document.
    markdown_text: str = ""
class DocxParser:
    """Parse Word (.docx) files into structured content using python-docx.

    The most recently opened document is kept in ``self.document``; the
    ``_extract_*`` helpers read from it, so they are only meaningful after
    :meth:`parse` has loaded a file.
    """

    # Heading styles such as "Heading 2" or the localized "标题 2".
    _STYLE_LEVEL_RE = re.compile(r'Heading\s*(\d)|标题\s*(\d)')

    # Structural headings of Chinese regulations: 第X章 / 第X节 / 第X条.
    # The ordinal may use Chinese numerals or Arabic digits. The marker must
    # be followed by whitespace or end the paragraph, so that running text
    # like "第三条规定的……" is not mistaken for a heading while a paragraph
    # consisting only of "第一章" still is.
    _STRUCTURE_RE = re.compile(
        r'^第[一二三四五六七八九十百千零〇\d]+([章节条])(?:\s|$)'
    )
    _STRUCTURE_LEVELS = {"章": 2, "节": 3, "条": 4}

    # List markers followed by whitespace: "1." "1、" "1)" "1]" "(1)" "[1]"
    # (ASCII or full-width brackets) ...
    _NUMERIC_LIST_RE = re.compile(
        r'^(\d+[.、)\])]|[((]\d+[))]|\[\d+\])\s'
    )
    # ... and their Chinese-numeral counterparts: "一、" "一." "一)" "(一)".
    _CHINESE_LIST_RE = re.compile(
        r'^([一二三四五六七八九十]+[、.))]'
        r'|[((][一二三四五六七八九十]+[))])\s'
    )

    def __init__(self):
        # Set by parse(); None until a document has been loaded.
        self.document = None

    def parse(self, file_path: str) -> "DocxDocumentContent":
        """Parse a Word document.

        Args:
            file_path: Path of the .docx file to read.

        Returns:
            The extracted metadata, paragraphs, tables and Markdown text.

        Raises:
            Exception: Any error raised while opening or reading the
                document is logged and then re-raised unchanged.
        """
        logger.info(f"开始解析Word文档: {file_path}")
        try:
            self.document = Document(file_path)
            doc_content = DocxDocumentContent(
                file_path=file_path,
                paragraphs=[],
                tables=[],
            )
            doc_content.metadata = self._extract_metadata()
            doc_content.paragraphs = self._extract_paragraphs()
            doc_content.tables = self._extract_tables()
            # Markdown is generated last because it needs all of the above.
            doc_content.markdown_text = self._generate_markdown(doc_content)
            logger.success(f"Word文档解析完成{len(doc_content.paragraphs)}个段落")
            return doc_content
        except Exception as e:
            logger.error(f"Word文档解析失败: {e}")
            raise

    def _extract_metadata(self) -> Dict[str, str]:
        """Return the document's core properties as strings.

        Extraction is best-effort: on any failure a warning is logged and
        an empty dict is returned instead of aborting the parse.
        """
        metadata = {}
        try:
            core_props = self.document.core_properties
            metadata = {
                "title": core_props.title or "",
                "author": core_props.author or "",
                "subject": core_props.subject or "",
                "keywords": core_props.keywords or "",
                "created": str(core_props.created) if core_props.created else "",
                "modified": str(core_props.modified) if core_props.modified else "",
            }
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
        return metadata

    def _extract_paragraphs(self) -> List["DocxParagraph"]:
        """Return every non-empty paragraph, classified as heading/list/body."""
        paragraphs = []
        for para in self.document.paragraphs:
            text = para.text.strip()
            if not text:
                continue  # skip blank paragraphs entirely
            level = self._get_paragraph_level(para)
            is_list, list_number = self._detect_list_item(para)
            paragraphs.append(
                DocxParagraph(
                    text=text,
                    level=level,
                    is_list=is_list,
                    list_number=list_number,
                )
            )
        return paragraphs

    def _get_paragraph_level(self, para) -> int:
        """Return the heading level of ``para``; 0 means body text.

        The paragraph style is consulted first ("Heading N" / "标题 N").
        Otherwise the text is matched against regulation structure markers:
        第X章 -> 2, 第X节 -> 3, 第X条 -> 4.
        """
        style = para.style
        # Guard against both a missing style and a style without a name.
        style_name = (style.name if style else "") or ""
        style_match = self._STYLE_LEVEL_RE.search(style_name)
        if style_match:
            return int(style_match.group(1) or style_match.group(2))

        structure_match = self._STRUCTURE_RE.match(para.text.strip())
        if structure_match:
            return self._STRUCTURE_LEVELS[structure_match.group(1)]
        return 0

    def _detect_list_item(self, para) -> tuple[bool, Optional[str]]:
        """Detect whether ``para`` starts with a textual list marker.

        Returns:
            ``(True, marker)`` when the text starts with a numeric or
            Chinese-numeral marker followed by whitespace, otherwise
            ``(False, None)``. Word's automatic numbering is not part of
            the paragraph text and is therefore not detected here.
        """
        text = para.text.strip()
        for pattern in (self._NUMERIC_LIST_RE, self._CHINESE_LIST_RE):
            marker = pattern.match(text)
            if marker:
                return True, marker.group(1)
        return False, None

    def _extract_tables(self) -> List["DocxTable"]:
        """Return every table of the document with its Markdown rendering."""
        tables = []
        for table in self.document.tables:
            rows = [
                [cell.text.strip() for cell in row.cells]
                for row in table.rows
            ]
            tables.append(
                DocxTable(rows=rows, markdown=self._table_to_markdown(rows))
            )
        return tables

    @staticmethod
    def _escape_table_cell(text: str) -> str:
        """Make ``text`` safe to embed in a single Markdown table cell."""
        # A raw "|" would start a new column and a raw line break would end
        # the table row, so both have to be neutralised.
        escaped = text.replace("|", "\\|")
        return re.sub(r'\r\n|\r|\n', "<br>", escaped)

    def _table_to_markdown(self, rows: List[List[str]]) -> str:
        """Render ``rows`` as a Markdown table; the first row is the header.

        Returns an empty string when there are no rows. Pipes and line
        breaks inside cells are escaped so they cannot break the table.
        """
        if not rows:
            return ""

        def render(cells: List[str]) -> str:
            return "| " + " | ".join(cells) + " |"

        header, *body = rows
        lines = [
            render([self._escape_table_cell(cell) for cell in header]),
            render(["---" for _ in header]),
        ]
        lines.extend(
            render([self._escape_table_cell(cell) for cell in row])
            for row in body
        )
        return "\n".join(lines)

    def _generate_markdown(self, doc_content: "DocxDocumentContent") -> str:
        """Render the parsed document as Markdown text.

        Layout: a title, a metadata section, the body paragraphs, and then
        all tables in a trailing section (tables are not placed inline).
        """
        # Title preference: metadata title, then a level-1 heading among the
        # first five paragraphs, then the file path.
        title = doc_content.metadata.get("title", "")
        if not title:
            title = next(
                (p.text for p in doc_content.paragraphs[:5] if p.level == 1),
                doc_content.file_path,
            )
        lines = [f"# {title}\n"]

        lines.append("\n## 文档信息\n")
        lines.extend(
            f"- **{key}**: {value}"
            for key, value in doc_content.metadata.items()
            if value
        )

        lines.append("\n## 正文\n")
        for para in doc_content.paragraphs:
            if para.level > 0:
                lines.append(f"\n{'#' * para.level} {para.text}\n")
            elif para.is_list:
                lines.append(f"- {para.text}")
            else:
                lines.append(para.text)

        if doc_content.tables:
            lines.append("\n## 表格\n")
            for number, table in enumerate(doc_content.tables, start=1):
                lines.append(f"\n### 表格 {number}\n")
                lines.append(table.markdown + "\n")
        return "\n".join(lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` and return only its Markdown rendering."""
        return self.parse(file_path).markdown_text
def parse_docx(file_path: str) -> DocxDocumentContent:
    """Parse the Word document at ``file_path`` with a fresh ``DocxParser``.

    Returns:
        The structured document content produced by ``DocxParser.parse``.
    """
    return DocxParser().parse(file_path)
def parse_docx_to_markdown(file_path: str) -> str:
    """Parse the Word document at ``file_path`` and return its Markdown text.

    A fresh ``DocxParser`` is created for every call.
    """
    return DocxParser().parse_to_markdown(file_path)