# src/services/parser/docx_parser.py
"""Word document parsing built on python-docx."""

import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

from docx import Document
# Not used in this module; kept so the module's importable names are unchanged.
from docx.enum.text import WD_ALIGN_PARAGRAPH
from loguru import logger

# Chinese numerals accepted in list markers (一、 二、 ...).
_CN_DIGITS = "一二三四五六七八九十"
# Chinese numerals accepted in 第X章/节/条 headings. 零/〇/千 are required for
# numbers such as 第一百零一条.
_CN_NUMERAL = "零〇" + _CN_DIGITS + "百千"

# Heading level embedded in a style name, e.g. "Heading 2" or "标题 2".
_STYLE_LEVEL_RE = re.compile(r"Heading\s*(\d)|标题\s*(\d)")

# Section headings: 第X章 / 第X节 / 第X条 followed by whitespace or end of
# text. Arabic digits (第1章) are accepted as well as Chinese numerals.
_SECTION_RE = re.compile(rf"^第[{_CN_NUMERAL}\d]+([章节条])(?:\s|$)")
_SECTION_LEVELS = {"章": 2, "节": 3, "条": 4}

# Leading list markers. Three alternatives:
#   1. enclosed numbers: (1) [1] (一) （一） -- at most three digits so that a
#      leading year such as "（2020）" is not taken for a list marker;
#   2. Arabic number + delimiter: 1. 1、 1) 1] -- not followed by a digit, so
#      "1.5" or "2024.1.1" are not list items;
#   3. Chinese numeral + delimiter: 一、 二. 三) -- not followed by another
#      numeral, so enumerations like "三、四个" are not list items.
_LIST_MARKER_RE = re.compile(
    r"^("
    rf"[(（\[]\s*(?:\d{{1,3}}|[{_CN_DIGITS}]+)\s*[)）\]]"
    r"|\d+[.、)）\]](?!\d)"
    rf"|[{_CN_DIGITS}]+[、.)）](?![{_CN_DIGITS}])"
    r")"
)

# Any line-break sequence inside a table cell.
_LINE_BREAK_RE = re.compile(r"\r\n|\r|\n")

# Markdown supports heading levels 1-6 only.
_MAX_MARKDOWN_HEADING = 6


@dataclass
class DocxParagraph:
    """A single non-empty paragraph."""

    text: str
    level: int = 0  # heading level; 0 means body text
    is_list: bool = False
    list_number: Optional[str] = None  # the list marker as written, e.g. "1."


@dataclass
class DocxTable:
    """A table as raw cell text plus its Markdown rendering."""

    rows: List[List[str]]
    markdown: str = ""


@dataclass
class DocxDocumentContent:
    """Everything extracted from one Word document."""

    file_path: str
    paragraphs: List[DocxParagraph]
    tables: List[DocxTable]
    metadata: Dict[str, str] = field(default_factory=dict)
    markdown_text: str = ""


class DocxParser:
    """Word document parser based on python-docx."""

    def __init__(self):
        # Document currently being parsed; set by parse().
        self.document = None

    def parse(self, file_path: str) -> DocxDocumentContent:
        """Parse a Word document.

        Args:
            file_path: Path of the Word document.

        Returns:
            DocxDocumentContent: metadata, paragraphs, tables and a Markdown
            rendering of the document.

        Raises:
            Exception: any error raised while opening or reading the document
                is logged and re-raised unchanged.
        """
        logger.info(f"开始解析Word文档: {file_path}")

        try:
            self.document = Document(file_path)

            doc_content = DocxDocumentContent(
                file_path=file_path,
                paragraphs=[],
                tables=[],
            )

            doc_content.metadata = self._extract_metadata()
            doc_content.paragraphs = self._extract_paragraphs()
            doc_content.tables = self._extract_tables()
            # Rendered last because it reads the three fields filled in above.
            doc_content.markdown_text = self._generate_markdown(doc_content)

            logger.success(f"Word文档解析完成,共{len(doc_content.paragraphs)}个段落")
            return doc_content

        except Exception as e:
            logger.error(f"Word文档解析失败: {e}")
            raise

    def _extract_metadata(self) -> Dict[str, str]:
        """Read the core properties of the document.

        Best effort: on any failure a warning is logged and an empty dict is
        returned, so parsing of the document body can continue.
        """
        metadata = {}
        try:
            core_props = self.document.core_properties
            metadata = {
                "title": core_props.title or "",
                "author": core_props.author or "",
                "subject": core_props.subject or "",
                "keywords": core_props.keywords or "",
                "created": str(core_props.created) if core_props.created else "",
                "modified": str(core_props.modified) if core_props.modified else "",
            }
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
        return metadata

    def _extract_paragraphs(self) -> List[DocxParagraph]:
        """Collect every non-empty paragraph with heading/list classification."""
        paragraphs = []
        for para in self.document.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            level = self._get_paragraph_level(para)
            is_list, list_number = self._detect_list_item(para)
            paragraphs.append(
                DocxParagraph(
                    text=text,
                    level=level,
                    is_list=is_list,
                    list_number=list_number,
                )
            )
        return paragraphs

    def _get_paragraph_level(self, para) -> int:
        """Determine the heading level of a paragraph.

        The style name is checked first ("Heading N" / "标题 N"). Otherwise
        the text is matched against section patterns: 第X章 -> 2, 第X节 -> 3,
        第X条 -> 4.

        Returns:
            int: heading level, 0 for body text.
        """
        # The style, or its name, may be missing; treat both as "no style".
        style_name = (para.style.name if para.style else "") or ""
        if "Heading" in style_name or "标题" in style_name:
            match = _STYLE_LEVEL_RE.search(style_name)
            if match:
                return int(match.group(1) or match.group(2))

        section = _SECTION_RE.match(para.text.strip())
        if section:
            return _SECTION_LEVELS[section.group(1)]

        return 0

    def _detect_list_item(self, para) -> Tuple[bool, Optional[str]]:
        """Detect whether the paragraph text starts with a list marker.

        Recognised markers: ``1.`` ``1、`` ``1)`` ``1]`` ``(1)`` ``[1]``
        ``一、`` ``一.`` ``一)`` ``(一)`` and their full-width parenthesis
        variants. Whitespace after the marker is optional.

        Returns:
            (True, marker) when a marker is found, otherwise (False, None).
        """
        match = _LIST_MARKER_RE.match(para.text.strip())
        if match:
            return True, match.group(1)
        return False, None

    def _extract_tables(self) -> List[DocxTable]:
        """Collect every table as stripped cell text plus Markdown."""
        tables = []
        for table in self.document.tables:
            rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            tables.append(DocxTable(rows=rows, markdown=self._table_to_markdown(rows)))
        return tables

    def _table_to_markdown(self, rows: List[List[str]]) -> str:
        """Render rows as a Markdown table, using the first row as header.

        Cell text is made table-safe: ``|`` is escaped and line breaks become
        ``<br>``. Rows shorter than the widest row are padded with empty cells
        so every line has the same number of columns.

        Returns:
            The Markdown table, or "" when there are no rows or no columns.
        """
        if not rows:
            return ""

        width = max(len(row) for row in rows)
        if width == 0:
            return ""

        def render(row: List[str]) -> str:
            cells = [
                _LINE_BREAK_RE.sub("<br>", cell.replace("|", "\\|")) for cell in row
            ]
            cells.extend([""] * (width - len(cells)))
            return "| " + " | ".join(cells) + " |"

        lines = [render(rows[0]), "| " + " | ".join(["---"] * width) + " |"]
        lines.extend(render(row) for row in rows[1:])
        return "\n".join(lines)

    def _generate_markdown(self, doc_content: DocxDocumentContent) -> str:
        """Render the extracted content as one Markdown string.

        Layout: document title, a metadata section, the body paragraphs, then
        all tables in a trailing section.
        """
        lines = []

        # Title: metadata title, else the first level-1 heading among the
        # first five paragraphs, else the file path.
        title = doc_content.metadata.get("title", "")
        if not title:
            title = next(
                (p.text for p in doc_content.paragraphs[:5] if p.level == 1),
                doc_content.file_path,
            )
        lines.append(f"# {title}\n")

        # Metadata: only non-empty values are listed.
        lines.append("\n## 文档信息\n")
        for key, value in doc_content.metadata.items():
            if value:
                lines.append(f"- **{key}**: {value}")

        # Body.
        lines.append("\n## 正文\n")
        for para in doc_content.paragraphs:
            if para.level > 0:
                # Word styles go up to level 9; Markdown stops at 6.
                prefix = "#" * min(para.level, _MAX_MARKDOWN_HEADING)
                lines.append(f"\n{prefix} {para.text}\n")
            elif para.is_list:
                lines.append(f"- {para.text}")
            else:
                lines.append(para.text)

        # Tables.
        if doc_content.tables:
            lines.append("\n## 表格\n")
            for i, table in enumerate(doc_content.tables):
                lines.append(f"\n### 表格 {i + 1}\n")
                lines.append(table.markdown + "\n")

        return "\n".join(lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse the document and return only its Markdown text."""
        doc_content = self.parse(file_path)
        return doc_content.markdown_text


def parse_docx(file_path: str) -> DocxDocumentContent:
    """Convenience wrapper: parse a Word document with a fresh parser."""
    parser = DocxParser()
    return parser.parse(file_path)


def parse_docx_to_markdown(file_path: str) -> str:
    """Convenience wrapper: parse a Word document and return Markdown."""
    parser = DocxParser()
    return parser.parse_to_markdown(file_path)