"""Provide service-layer logic for docx parser.""" from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH from typing import List, Dict, Optional from dataclasses import dataclass, field from loguru import logger import re # Keep service responsibilities explicit so downstream behavior stays predictable. @dataclass class DocxParagraph: """Represent the Docx Paragraph type.""" text: str level: int = 0 # Keep service responsibilities explicit so downstream behavior stays predictable. is_list: bool = False list_number: Optional[str] = None @dataclass class DocxTable: """Represent the Docx Table type.""" rows: List[List[str]] markdown: str = "" @dataclass class DocxDocumentContent: """Represent the Docx Document Content type.""" file_path: str paragraphs: List[DocxParagraph] tables: List[DocxTable] metadata: Dict[str, str] = field(default_factory=dict) markdown_text: str = "" class DocxParser: """Provide the Docx Parser parser.""" def __init__(self): """Initialize the Docx Parser instance.""" self.document = None def parse(self, file_path: str) -> DocxDocumentContent: """Handle parse for the Docx Parser instance.""" logger.info(f"开始解析Word文档: {file_path}") try: self.document = Document(file_path) doc_content = DocxDocumentContent( file_path=file_path, paragraphs=[], tables=[] ) # Keep service responsibilities explicit so downstream behavior stays predictable. doc_content.metadata = self._extract_metadata() # Keep service responsibilities explicit so downstream behavior stays predictable. doc_content.paragraphs = self._extract_paragraphs() # Keep service responsibilities explicit so downstream behavior stays predictable. doc_content.tables = self._extract_tables() # Keep service responsibilities explicit so downstream behavior stays predictable. doc_content.markdown_text = self._generate_markdown(doc_content) logger.success(f"Word文档解析完成,共{len(doc_content.paragraphs)}个段落") return doc_content except Exception as e: logger.error(f"Word文档解析失败: {e}") raise def _extract_metadata(self) -> Dict[str, str]: """Handle extract metadata for this module for the Docx Parser instance.""" metadata = {} try: core_props = self.document.core_properties metadata = { "title": core_props.title or "", "author": core_props.author or "", "subject": core_props.subject or "", "keywords": core_props.keywords or "", "created": str(core_props.created) if core_props.created else "", "modified": str(core_props.modified) if core_props.modified else "", } except Exception as e: logger.warning(f"提取元数据失败: {e}") return metadata def _extract_paragraphs(self) -> List[DocxParagraph]: """Handle extract paragraphs for this module for the Docx Parser instance.""" paragraphs = [] for para in self.document.paragraphs: text = para.text.strip() if not text: continue # Keep service responsibilities explicit so downstream behavior stays predictable. level = self._get_paragraph_level(para) # Keep service responsibilities explicit so downstream behavior stays predictable. is_list, list_number = self._detect_list_item(para) paragraph = DocxParagraph( text=text, level=level, is_list=is_list, list_number=list_number ) paragraphs.append(paragraph) return paragraphs def _get_paragraph_level(self, para) -> int: """Handle get paragraph level for this module for the Docx Parser instance.""" # Keep service responsibilities explicit so downstream behavior stays predictable. style_name = para.style.name if para.style else "" if "Heading" in style_name or "标题" in style_name: # Keep service responsibilities explicit so downstream behavior stays predictable. match = re.search(r'Heading\s*(\d)|标题\s*(\d)', style_name) if match: level = int(match.group(1) or match.group(2)) return level # Keep service responsibilities explicit so downstream behavior stays predictable. # Keep service responsibilities explicit so downstream behavior stays predictable. if para.paragraph_format: # Keep service responsibilities explicit so downstream behavior stays predictable. pass # Keep service responsibilities explicit so downstream behavior stays predictable. text = para.text.strip() # Keep service responsibilities explicit so downstream behavior stays predictable. if re.match(r'^第[一二三四五六七八九十百]+章\s', text): return 2 # Keep service responsibilities explicit so downstream behavior stays predictable. elif re.match(r'^第[一二三四五六七八九十百]+节\s', text): return 3 # Keep service responsibilities explicit so downstream behavior stays predictable. elif re.match(r'^第[一二三四五六七八九十百]+条\s', text): return 4 return 0 # Keep service responsibilities explicit so downstream behavior stays predictable. def _detect_list_item(self, para) -> tuple[bool, Optional[str]]: """Handle detect list item for this module for the Docx Parser instance.""" text = para.text.strip() # Keep service responsibilities explicit so downstream behavior stays predictable. if re.match(r'^[\d]+[.、)\]]\s', text): match = re.match(r'^([\d]+[.、)\]])\s', text) return True, match.group(1) if match else None # Keep service responsibilities explicit so downstream behavior stays predictable. if re.match(r'^[一二三四五六七八九十]+[、.)]\s', text): match = re.match(r'^([一二三四五六七八九十]+[、.)])\s', text) return True, match.group(1) if match else None # Keep service responsibilities explicit so downstream behavior stays predictable. if para.paragraph_format and hasattr(para.paragraph_format, 'left_indent'): # Keep service responsibilities explicit so downstream behavior stays predictable. pass return False, None def _extract_tables(self) -> List[DocxTable]: """Handle extract tables for this module for the Docx Parser instance.""" tables = [] for table in self.document.tables: rows = [] for row in table.rows: cells = [] for cell in row.cells: cells.append(cell.text.strip()) rows.append(cells) # Keep service responsibilities explicit so downstream behavior stays predictable. markdown = self._table_to_markdown(rows) table_content = DocxTable(rows=rows, markdown=markdown) tables.append(table_content) return tables def _table_to_markdown(self, rows: List[List[str]]) -> str: """Handle table to markdown for this module for the Docx Parser instance.""" if not rows or len(rows) < 1: return "" lines = [] # Keep service responsibilities explicit so downstream behavior stays predictable. if len(rows) >= 1: header = rows[0] lines.append("| " + " | ".join(cell for cell in header) + " |") lines.append("| " + " | ".join("---" for _ in header) + " |") # Keep service responsibilities explicit so downstream behavior stays predictable. for row in rows[1:]: lines.append("| " + " | ".join(cell for cell in row) + " |") return "\n".join(lines) def _generate_markdown(self, doc_content: DocxDocumentContent) -> str: """Handle generate markdown for this module for the Docx Parser instance.""" lines = [] # Keep service responsibilities explicit so downstream behavior stays predictable. title = doc_content.metadata.get("title", "") if title: lines.append(f"# {title}\n") else: # Keep service responsibilities explicit so downstream behavior stays predictable. for para in doc_content.paragraphs[:5]: if para.level == 1: lines.append(f"# {para.text}\n") break else: lines.append(f"# {doc_content.file_path}\n") # Keep service responsibilities explicit so downstream behavior stays predictable. lines.append("\n## 文档信息\n") for key, value in doc_content.metadata.items(): if value: lines.append(f"- **{key}**: {value}") # Keep service responsibilities explicit so downstream behavior stays predictable. lines.append("\n## 正文\n") table_index = 0 for para in doc_content.paragraphs: if para.level > 0: # Keep service responsibilities explicit so downstream behavior stays predictable. prefix = "#" * para.level lines.append(f"\n{prefix} {para.text}\n") elif para.is_list: # Keep service responsibilities explicit so downstream behavior stays predictable. lines.append(f"- {para.text}") else: # Keep service responsibilities explicit so downstream behavior stays predictable. lines.append(para.text) # Keep service responsibilities explicit so downstream behavior stays predictable. if doc_content.tables: lines.append("\n## 表格\n") for i, table in enumerate(doc_content.tables): lines.append(f"\n### 表格 {i + 1}\n") lines.append(table.markdown + "\n") return "\n".join(lines) def parse_to_markdown(self, file_path: str) -> str: """Parse to markdown for the Docx Parser instance.""" doc_content = self.parse(file_path) return doc_content.markdown_text def parse_docx(file_path: str) -> DocxDocumentContent: """Parse docx.""" parser = DocxParser() return parser.parse(file_path) def parse_docx_to_markdown(file_path: str) -> str: """Parse docx to markdown.""" parser = DocxParser() return parser.parse_to_markdown(file_path)