Files
AIRegulation-DocAnalysis/backend/app/services/parser/docx_parser.py
2026-05-14 15:07:34 +08:00

287 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# src/services/parser/docx_parser.py
"""Word文档解析 - 使用python-docx"""
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from typing import List, Dict, Optional
from dataclasses import dataclass, field
from loguru import logger
import re
@dataclass
class DocxParagraph:
"""段落内容"""
text: str
level: int = 0 # 标题级别0表示正文
is_list: bool = False
list_number: Optional[str] = None
@dataclass
class DocxTable:
"""表格内容"""
rows: List[List[str]]
markdown: str = ""
@dataclass
class DocxDocumentContent:
"""Word文档完整内容"""
file_path: str
paragraphs: List[DocxParagraph]
tables: List[DocxTable]
metadata: Dict[str, str] = field(default_factory=dict)
markdown_text: str = ""
class DocxParser:
"""Word文档解析器 - 基于python-docx"""
def __init__(self):
self.document = None
def parse(self, file_path: str) -> DocxDocumentContent:
"""
解析Word文档
Args:
file_path: Word文档路径
Returns:
DocxDocumentContent: 解析后的文档内容
"""
logger.info(f"开始解析Word文档: {file_path}")
try:
self.document = Document(file_path)
doc_content = DocxDocumentContent(
file_path=file_path,
paragraphs=[],
tables=[]
)
# 提取文档元数据
doc_content.metadata = self._extract_metadata()
# 提取段落
doc_content.paragraphs = self._extract_paragraphs()
# 提取表格
doc_content.tables = self._extract_tables()
# 生成Markdown格式文本
doc_content.markdown_text = self._generate_markdown(doc_content)
logger.success(f"Word文档解析完成{len(doc_content.paragraphs)}个段落")
return doc_content
except Exception as e:
logger.error(f"Word文档解析失败: {e}")
raise
def _extract_metadata(self) -> Dict[str, str]:
"""提取文档元数据"""
metadata = {}
try:
core_props = self.document.core_properties
metadata = {
"title": core_props.title or "",
"author": core_props.author or "",
"subject": core_props.subject or "",
"keywords": core_props.keywords or "",
"created": str(core_props.created) if core_props.created else "",
"modified": str(core_props.modified) if core_props.modified else "",
}
except Exception as e:
logger.warning(f"提取元数据失败: {e}")
return metadata
def _extract_paragraphs(self) -> List[DocxParagraph]:
"""提取所有段落"""
paragraphs = []
for para in self.document.paragraphs:
text = para.text.strip()
if not text:
continue
# 判断标题级别
level = self._get_paragraph_level(para)
# 判断是否是列表项
is_list, list_number = self._detect_list_item(para)
paragraph = DocxParagraph(
text=text,
level=level,
is_list=is_list,
list_number=list_number
)
paragraphs.append(paragraph)
return paragraphs
def _get_paragraph_level(self, para) -> int:
"""
判断段落标题级别
Returns:
int: 标题级别0表示正文
"""
# 方法1检查段落样式
style_name = para.style.name if para.style else ""
if "Heading" in style_name or "标题" in style_name:
# 从样式名称中提取级别
match = re.search(r'Heading\s*(\d)|标题\s*(\d)', style_name)
if match:
level = int(match.group(1) or match.group(2))
return level
# 方法2检查段落格式字号
# 标题通常字号较大
if para.paragraph_format:
# 可以根据字号判断,这里简化处理
pass
# 方法3根据内容模式判断法规文档特征
text = para.text.strip()
# 第一章、第X章 -> 二级标题
if re.match(r'^第[一二三四五六七八九十百]+章\s', text):
return 2
# 第X节 -> 三级标题
elif re.match(r'^第[一二三四五六七八九十百]+节\s', text):
return 3
# 第X条 -> 四级标题
elif re.match(r'^第[一二三四五六七八九十百]+条\s', text):
return 4
return 0 # 正文
def _detect_list_item(self, para) -> tuple[bool, Optional[str]]:
"""检测是否是列表项"""
text = para.text.strip()
# 数字列表1.、2.、1、[1]等
if re.match(r'^[\d]+[.、)\]]\s', text):
match = re.match(r'^([\d]+[.、)\]])\s', text)
return True, match.group(1) if match else None
# 中文数字列表:一、二、(一)等
if re.match(r'^[一二三四五六七八九十]+[、.)]\s', text):
match = re.match(r'^([一二三四五六七八九十]+[、.)])\s', text)
return True, match.group(1) if match else None
# 检查段落格式中的列表编号
if para.paragraph_format and hasattr(para.paragraph_format, 'left_indent'):
# 有缩进的可能是列表项
pass
return False, None
def _extract_tables(self) -> List[DocxTable]:
"""提取所有表格"""
tables = []
for table in self.document.tables:
rows = []
for row in table.rows:
cells = []
for cell in row.cells:
cells.append(cell.text.strip())
rows.append(cells)
# 转换为Markdown表格
markdown = self._table_to_markdown(rows)
table_content = DocxTable(rows=rows, markdown=markdown)
tables.append(table_content)
return tables
def _table_to_markdown(self, rows: List[List[str]]) -> str:
"""将表格转换为Markdown格式"""
if not rows or len(rows) < 1:
return ""
lines = []
# 表头
if len(rows) >= 1:
header = rows[0]
lines.append("| " + " | ".join(cell for cell in header) + " |")
lines.append("| " + " | ".join("---" for _ in header) + " |")
# 数据行
for row in rows[1:]:
lines.append("| " + " | ".join(cell for cell in row) + " |")
return "\n".join(lines)
def _generate_markdown(self, doc_content: DocxDocumentContent) -> str:
"""生成Markdown格式文本"""
lines = []
# 文档标题
title = doc_content.metadata.get("title", "")
if title:
lines.append(f"# {title}\n")
else:
# 从第一个段落获取标题(如果是标题样式)
for para in doc_content.paragraphs[:5]:
if para.level == 1:
lines.append(f"# {para.text}\n")
break
else:
lines.append(f"# {doc_content.file_path}\n")
# 元数据信息
lines.append("\n## 文档信息\n")
for key, value in doc_content.metadata.items():
if value:
lines.append(f"- **{key}**: {value}")
# 正文内容
lines.append("\n## 正文\n")
table_index = 0
for para in doc_content.paragraphs:
if para.level > 0:
# 标题
prefix = "#" * para.level
lines.append(f"\n{prefix} {para.text}\n")
elif para.is_list:
# 列表项
lines.append(f"- {para.text}")
else:
# 正文
lines.append(para.text)
# 添加表格
if doc_content.tables:
lines.append("\n## 表格\n")
for i, table in enumerate(doc_content.tables):
lines.append(f"\n### 表格 {i + 1}\n")
lines.append(table.markdown + "\n")
return "\n".join(lines)
def parse_to_markdown(self, file_path: str) -> str:
"""直接解析并返回Markdown文本"""
doc_content = self.parse(file_path)
return doc_content.markdown_text
def parse_docx(file_path: str) -> DocxDocumentContent:
"""便捷函数解析Word文档"""
parser = DocxParser()
return parser.parse(file_path)
def parse_docx_to_markdown(file_path: str) -> str:
"""便捷函数解析Word并返回Markdown"""
parser = DocxParser()
return parser.parse_to_markdown(file_path)