"""Parse Word (.docx) documents into structured content and Markdown text."""

import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from loguru import logger


# Data containers returned by DocxParser.
@dataclass
class DocxParagraph:
    """A single non-empty paragraph extracted from a Word document."""

    # Paragraph text with surrounding whitespace stripped.
    text: str
    # Heading depth; 0 means ordinary body text. Values > 0 are rendered as
    # that many '#' characters when Markdown is generated.
    level: int = 0
    # True when the text starts with a recognised list marker.
    is_list: bool = False
    # The list marker that was matched (e.g. "1." or "一、"), if any.
    list_number: Optional[str] = None


@dataclass
class DocxTable:
    """A table extracted from a Word document."""

    # Cell texts, one inner list per table row.
    rows: List[List[str]]
    # Markdown rendering of ``rows``; empty until it has been generated.
    markdown: str = ""


@dataclass
class DocxDocumentContent:
    """Everything extracted from one Word document."""

    # Path the document was loaded from (as passed to the parser).
    file_path: str
    # Non-empty paragraphs in document order.
    paragraphs: List[DocxParagraph]
    # Tables in document order.
    tables: List[DocxTable]
    # Core document properties as strings; a fresh dict per instance.
    metadata: Dict[str, str] = field(default_factory=dict)
    # Markdown rendering of the whole document; empty until generated.
    markdown_text: str = ""


class DocxParser:
    """Parse a Word (.docx) file into paragraphs, tables, metadata and Markdown.

    The instance keeps the most recently loaded python-docx document in
    ``self.document``; the private ``_extract_*`` helpers read from it.
    """

    # Markdown only recognises ATX headings with one to six '#' characters.
    _MAX_MARKDOWN_HEADING = 6

    # Heading level encoded in a style name such as "Heading 2" or "标题 2".
    _HEADING_STYLE_RE = re.compile(r'Heading\s*(\d)|标题\s*(\d)')

    # "第…章 / 第…节 / 第…条" followed by whitespace at the start of the text.
    _CHINESE_SECTION_RE = re.compile(r'^第[一二三四五六七八九十百]+([章节条])\s')
    _CHINESE_SECTION_LEVELS = {"章": 2, "节": 3, "条": 4}

    # List markers: "1." / "1、" / "1)" / "1]" and "一、" / "一." / "一)",
    # each followed by whitespace.
    _ARABIC_LIST_RE = re.compile(r'^([\d]+[.、)\]])\s')
    _CHINESE_LIST_RE = re.compile(r'^([一二三四五六七八九十]+[、.)])\s')

    def __init__(self):
        """Create a parser with no document loaded yet."""
        self.document = None

    def parse(self, file_path: str) -> DocxDocumentContent:
        """Load ``file_path`` and extract its metadata, paragraphs and tables.

        Args:
            file_path: Path of the .docx file to open.

        Returns:
            A ``DocxDocumentContent`` with ``markdown_text`` already filled in.

        Raises:
            Exception: Any error raised while opening or extracting the
                document is logged and then re-raised unchanged.
        """
        logger.info(f"开始解析Word文档: {file_path}")

        try:
            self.document = Document(file_path)
            doc_content = DocxDocumentContent(
                file_path=file_path,
                paragraphs=[],
                tables=[],
            )

            doc_content.metadata = self._extract_metadata()
            doc_content.paragraphs = self._extract_paragraphs()
            doc_content.tables = self._extract_tables()
            # Markdown is built last because it reads all three fields above.
            doc_content.markdown_text = self._generate_markdown(doc_content)

            logger.success(f"Word文档解析完成,共{len(doc_content.paragraphs)}个段落")
            return doc_content

        except Exception as e:
            logger.error(f"Word文档解析失败: {e}")
            raise

    def _extract_metadata(self) -> Dict[str, str]:
        """Read the core properties of the loaded document.

        Returns:
            A dict with the keys ``title``, ``author``, ``subject``,
            ``keywords``, ``created`` and ``modified`` (missing values become
            ``""``). If reading the properties fails, a warning is logged and
            an empty dict is returned.
        """
        metadata: Dict[str, str] = {}
        try:
            core_props = self.document.core_properties
            metadata = {
                "title": core_props.title or "",
                "author": core_props.author or "",
                "subject": core_props.subject or "",
                "keywords": core_props.keywords or "",
                "created": str(core_props.created) if core_props.created else "",
                "modified": str(core_props.modified) if core_props.modified else "",
            }
        except Exception as e:
            # Best effort: metadata is optional, so a failure must not abort
            # the whole parse.
            logger.warning(f"提取元数据失败: {e}")
        return metadata

    def _extract_paragraphs(self) -> List[DocxParagraph]:
        """Return every non-empty paragraph of the loaded document, in order."""
        paragraphs = []

        for para in self.document.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            level = self._get_paragraph_level(para)
            is_list, list_number = self._detect_list_item(para)

            paragraphs.append(
                DocxParagraph(
                    text=text,
                    level=level,
                    is_list=is_list,
                    list_number=list_number,
                )
            )

        return paragraphs

    def _get_paragraph_level(self, para) -> int:
        """Infer the heading level of ``para``.

        The style name is checked first ("Heading N" / "标题 N"). Otherwise
        the text is checked for a leading "第…章", "第…节" or "第…条" followed
        by whitespace, which map to levels 2, 3 and 4.

        Returns:
            The heading level, or 0 when the paragraph is not a heading.
        """
        # A style's name may be missing; treat that like "no style".
        style_name = (para.style.name if para.style else "") or ""

        style_match = self._HEADING_STYLE_RE.search(style_name)
        if style_match:
            return int(style_match.group(1) or style_match.group(2))

        section_match = self._CHINESE_SECTION_RE.match(para.text.strip())
        if section_match:
            return self._CHINESE_SECTION_LEVELS[section_match.group(1)]

        return 0

    def _detect_list_item(self, para) -> Tuple[bool, Optional[str]]:
        """Detect a textual list marker at the start of ``para``.

        Returns:
            ``(True, marker)`` when the stripped text starts with an Arabic
            or Chinese numeral marker followed by whitespace, otherwise
            ``(False, None)``.
        """
        text = para.text.strip()

        for pattern in (self._ARABIC_LIST_RE, self._CHINESE_LIST_RE):
            marker = pattern.match(text)
            if marker:
                return True, marker.group(1)

        return False, None

    def _extract_tables(self) -> List[DocxTable]:
        """Return every table of the loaded document with its Markdown form."""
        tables = []

        for table in self.document.tables:
            rows = [
                [cell.text.strip() for cell in row.cells]
                for row in table.rows
            ]
            tables.append(
                DocxTable(rows=rows, markdown=self._table_to_markdown(rows))
            )

        return tables

    @staticmethod
    def _escape_cell(text: str) -> str:
        """Make ``text`` safe to place inside one Markdown table cell."""
        # A raw pipe would start a new column and a raw line break would end
        # the table row, so both are neutralised.
        single_line = text.replace("\r\n", "\n").replace("\r", "\n")
        return single_line.replace("|", "\\|").replace("\n", "<br>")

    def _table_to_markdown(self, rows: List[List[str]]) -> str:
        """Render ``rows`` as a Markdown table, using the first row as header.

        Pipes in cell text are backslash-escaped, line breaks become
        ``<br>``, and short rows are padded with empty cells so every row has
        the same number of columns.

        Returns:
            The table as Markdown, or ``""`` when ``rows`` is empty.
        """
        if not rows:
            return ""

        width = max(len(row) for row in rows)

        def render(row: List[str]) -> str:
            cells = [self._escape_cell(cell) for cell in row]
            cells.extend("" for _ in range(width - len(cells)))
            return "| " + " | ".join(cells) + " |"

        header, *body = rows
        lines = [
            render(header),
            "| " + " | ".join("---" for _ in range(width)) + " |",
        ]
        lines.extend(render(row) for row in body)

        return "\n".join(lines)

    def _generate_markdown(self, doc_content: DocxDocumentContent) -> str:
        """Build the Markdown rendering of ``doc_content``.

        Layout: a title line, a "文档信息" section listing non-empty
        metadata, a "正文" section with all paragraphs, and, when tables
        exist, a "表格" section with one sub-heading per table.
        """
        lines: List[str] = []

        # Title: metadata title, else the first level-1 paragraph among the
        # first five, else the file path.
        title = doc_content.metadata.get("title", "")
        if not title:
            first_h1 = next(
                (p for p in doc_content.paragraphs[:5] if p.level == 1),
                None,
            )
            title = first_h1.text if first_h1 is not None else doc_content.file_path
        lines.append(f"# {title}\n")

        lines.append("\n## 文档信息\n")
        for key, value in doc_content.metadata.items():
            if value:
                lines.append(f"- **{key}**: {value}")

        lines.append("\n## 正文\n")
        for para in doc_content.paragraphs:
            if para.level > 0:
                # Word allows "Heading 7–9"; clamp so the line stays a heading.
                prefix = "#" * min(para.level, self._MAX_MARKDOWN_HEADING)
                lines.append(f"\n{prefix} {para.text}\n")
            elif para.is_list:
                lines.append(f"- {para.text}")
            else:
                lines.append(para.text)

        if doc_content.tables:
            lines.append("\n## 表格\n")
            for i, table in enumerate(doc_content.tables):
                lines.append(f"\n### 表格 {i + 1}\n")
                lines.append(table.markdown + "\n")

        return "\n".join(lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` and return only its Markdown rendering."""
        return self.parse(file_path).markdown_text


def parse_docx(file_path: str) -> DocxDocumentContent:
    """Parse the Word document at ``file_path`` with a fresh ``DocxParser``.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The extracted ``DocxDocumentContent``.

    Raises:
        Exception: Whatever ``DocxParser.parse`` raises is propagated.
    """
    return DocxParser().parse(file_path)


def parse_docx_to_markdown(file_path: str) -> str:
    """Parse the Word document at ``file_path`` and return its Markdown text.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The Markdown rendering produced by ``DocxParser.parse_to_markdown``.

    Raises:
        Exception: Whatever ``DocxParser.parse_to_markdown`` raises is
            propagated.
    """
    return DocxParser().parse_to_markdown(file_path)