Files

277 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Provide service-layer logic for docx parser."""
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from typing import List, Dict, Optional
from dataclasses import dataclass, field
from loguru import logger
import re
# Keep service responsibilities explicit so downstream behavior stays predictable.
@dataclass
class DocxParagraph:
    """One paragraph of text taken from a Word document.

    Attributes:
        text: The paragraph text.
        level: Heading depth assigned to the paragraph; ``0`` (the default)
            marks ordinary body text.
        is_list: Whether the paragraph was recognised as a list item.
        list_number: The list marker found at the start of the text
            (for example ``"1."``), or ``None`` when there is none.
    """

    text: str
    level: int = 0  # 0 == not a heading
    is_list: bool = False
    list_number: Optional[str] = None
@dataclass
class DocxTable:
    """One table taken from a Word document.

    Attributes:
        rows: Cell texts, one inner list per table row.
        markdown: Markdown rendering of ``rows``; empty string by default.
    """

    rows: List[List[str]]
    markdown: str = ""
@dataclass
class DocxDocumentContent:
    """Everything extracted from a single Word document.

    Only ``file_path`` is required; every collection field defaults to a
    fresh empty container so instances never share mutable state.

    Attributes:
        file_path: Path of the parsed document, as given by the caller.
        paragraphs: Extracted paragraphs, in document order.
        tables: Extracted tables, in document order.
        metadata: Core document properties as string key/value pairs.
        markdown_text: Markdown rendering of the whole document.
    """

    file_path: str
    # Forward references keep this class importable on its own.
    paragraphs: List["DocxParagraph"] = field(default_factory=list)
    tables: List["DocxTable"] = field(default_factory=list)
    metadata: Dict[str, str] = field(default_factory=dict)
    markdown_text: str = ""
class DocxParser:
    """Parse a Word ``.docx`` file into paragraphs, tables, metadata and Markdown.

    The most recently opened document is kept on ``self.document``; the
    ``_extract_*`` helpers read from it, so they are only meaningful after
    ``parse`` has loaded a file.
    """

    # Patterns are compiled once at class creation instead of on every paragraph.

    # Style names such as "Heading 2" or "标题 2"; the digit is the heading level.
    _HEADING_STYLE_RE = re.compile(r'Heading\s*(\d)|标题\s*(\d)')

    # Text-based fallbacks: chapter / section / article markers followed by whitespace.
    _TEXT_LEVEL_PATTERNS = (
        (re.compile(r'^第[一二三四五六七八九十百]+章\s'), 2),
        (re.compile(r'^第[一二三四五六七八九十百]+节\s'), 3),
        (re.compile(r'^第[一二三四五六七八九十百]+条\s'), 4),
    )

    # Leading list markers: Arabic digits first, then Chinese numerals.
    _LIST_MARKER_PATTERNS = (
        re.compile(r'^([\d]+[.、)\]])\s'),
        re.compile(r'^([一二三四五六七八九十]+[、.)])\s'),
    )

    # Markdown headings stop at six '#'; longer runs are rendered as plain text.
    _MAX_MARKDOWN_HEADING_LEVEL = 6

    def __init__(self):
        """Create a parser with no document loaded yet."""
        self.document = None

    def parse(self, file_path: str) -> "DocxDocumentContent":
        """Load ``file_path`` and extract its metadata, paragraphs, tables and Markdown.

        Args:
            file_path: Path of the ``.docx`` file to open.

        Returns:
            A fully populated ``DocxDocumentContent``.

        Raises:
            Exception: Any error raised while opening or processing the
                document is logged and then re-raised unchanged.
        """
        logger.info(f"开始解析Word文档: {file_path}")
        try:
            self.document = Document(file_path)
            doc_content = DocxDocumentContent(
                file_path=file_path,
                paragraphs=[],
                tables=[],
            )
            doc_content.metadata = self._extract_metadata()
            doc_content.paragraphs = self._extract_paragraphs()
            doc_content.tables = self._extract_tables()
            # Markdown is generated last because it reads the three fields above.
            doc_content.markdown_text = self._generate_markdown(doc_content)
            logger.success(f"Word文档解析完成{len(doc_content.paragraphs)}个段落")
            return doc_content
        except Exception as e:
            logger.error(f"Word文档解析失败: {e}")
            raise

    def _extract_metadata(self) -> Dict[str, str]:
        """Return the document's core properties as strings.

        Extraction is best-effort: if reading the properties fails, a warning
        is logged and an empty dict is returned instead of raising.
        """
        try:
            core_props = self.document.core_properties
            return {
                "title": core_props.title or "",
                "author": core_props.author or "",
                "subject": core_props.subject or "",
                "keywords": core_props.keywords or "",
                "created": str(core_props.created) if core_props.created else "",
                "modified": str(core_props.modified) if core_props.modified else "",
            }
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
            return {}

    def _extract_paragraphs(self) -> List["DocxParagraph"]:
        """Return every non-blank paragraph of the loaded document.

        Text is stripped of surrounding whitespace; paragraphs that are empty
        after stripping are skipped.
        """
        paragraphs = []
        for para in self.document.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            level = self._get_paragraph_level(para)
            is_list, list_number = self._detect_list_item(para)
            paragraphs.append(
                DocxParagraph(
                    text=text,
                    level=level,
                    is_list=is_list,
                    list_number=list_number,
                )
            )
        return paragraphs

    def _get_paragraph_level(self, para) -> int:
        """Return the heading level of ``para``, or ``0`` for body text.

        The style name is checked first ("Heading N" / "标题 N" gives ``N``).
        Otherwise the stripped text is matched against chapter (章 -> 2),
        section (节 -> 3) and article (条 -> 4) markers.
        """
        # The trailing `or ""` covers a style whose name is None.
        style_name = (para.style.name if para.style else "") or ""
        style_match = self._HEADING_STYLE_RE.search(style_name)
        if style_match:
            return int(style_match.group(1) or style_match.group(2))

        text = para.text.strip()
        for pattern, level in self._TEXT_LEVEL_PATTERNS:
            if pattern.match(text):
                return level
        return 0

    def _detect_list_item(self, para) -> tuple[bool, Optional[str]]:
        """Detect a leading list marker in the paragraph text.

        Returns:
            ``(True, marker)`` when the stripped text starts with a numeric or
            Chinese-numeral marker followed by whitespace (for example
            ``"1."`` or ``"一、"``); ``(False, None)`` otherwise.
        """
        text = para.text.strip()
        for pattern in self._LIST_MARKER_PATTERNS:
            marker = pattern.match(text)
            if marker:
                return True, marker.group(1)
        return False, None

    def _extract_tables(self) -> List["DocxTable"]:
        """Return every table of the loaded document with its Markdown rendering."""
        tables = []
        for table in self.document.tables:
            rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            tables.append(DocxTable(rows=rows, markdown=self._table_to_markdown(rows)))
        return tables

    @staticmethod
    def _escape_table_cell(text: str) -> str:
        """Make ``text`` safe inside one Markdown table cell.

        Line breaks become ``<br>`` (a table row must stay on one line) and
        ``|`` is backslash-escaped so it is not read as a column separator.
        """
        return "<br>".join(text.splitlines()).replace("|", "\\|")

    def _table_to_markdown(self, rows: List[List[str]]) -> str:
        """Render ``rows`` as a Markdown table, using the first row as header.

        Args:
            rows: Cell texts, one inner list per row.

        Returns:
            The table as Markdown, or ``""`` when ``rows`` is empty. Short
            rows are padded with empty cells so every line has the same
            number of columns.
        """
        if not rows:
            return ""

        width = max(len(row) for row in rows)

        def render(row: List[str]) -> str:
            cells = [self._escape_table_cell(cell) for cell in row]
            cells.extend([""] * (width - len(cells)))
            return "| " + " | ".join(cells) + " |"

        header, *body = rows
        lines = [render(header), "| " + " | ".join(["---"] * width) + " |"]
        lines.extend(render(row) for row in body)
        return "\n".join(lines)

    def _generate_markdown(self, doc_content: "DocxDocumentContent") -> str:
        """Build the Markdown rendering of ``doc_content``.

        Layout: a top-level title, a metadata section, the body paragraphs and
        finally all tables. The title is the metadata title if set, else the
        text of the first level-1 paragraph among the first five, else the
        file path.
        """
        lines = []

        title = doc_content.metadata.get("title", "")
        if not title:
            first_h1 = next(
                (para for para in doc_content.paragraphs[:5] if para.level == 1),
                None,
            )
            title = first_h1.text if first_h1 is not None else doc_content.file_path
        lines.append(f"# {title}\n")

        lines.append("\n## 文档信息\n")
        lines.extend(
            f"- **{key}**: {value}"
            for key, value in doc_content.metadata.items()
            if value
        )

        lines.append("\n## 正文\n")
        for para in doc_content.paragraphs:
            if para.level > 0:
                # Clamp: more than six '#' is not a heading in Markdown.
                prefix = "#" * min(para.level, self._MAX_MARKDOWN_HEADING_LEVEL)
                lines.append(f"\n{prefix} {para.text}\n")
            elif para.is_list:
                lines.append(f"- {para.text}")
            else:
                lines.append(para.text)

        if doc_content.tables:
            lines.append("\n## 表格\n")
            for i, table in enumerate(doc_content.tables):
                lines.append(f"\n### 表格 {i + 1}\n")
                lines.append(table.markdown + "\n")

        return "\n".join(lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse ``file_path`` and return only its Markdown rendering."""
        return self.parse(file_path).markdown_text
def parse_docx(file_path: str) -> DocxDocumentContent:
    """Parse the Word document at ``file_path`` with a fresh ``DocxParser``.

    Returns the structured content produced by ``DocxParser.parse``.
    """
    return DocxParser().parse(file_path)
def parse_docx_to_markdown(file_path: str) -> str:
    """Parse the Word document at ``file_path`` and return its Markdown text.

    A fresh ``DocxParser`` is used for each call.
    """
    return DocxParser().parse_to_markdown(file_path)