# AIRegulation-DocAnalysis/backend/app/services/parser/docx_parser.py

"""Provide service-layer logic for docx parser."""

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from typing import List, Dict, Optional
from dataclasses import dataclass, field
from loguru import logger
import re
# Plain data containers returned by DocxParser; they carry no behaviour of their own.


@dataclass
class DocxParagraph:
    """One non-empty body paragraph extracted from a Word document.

    Instances are created by ``DocxParser._extract_paragraphs`` and rendered
    by ``DocxParser._generate_markdown``.
    """
    # Paragraph text with surrounding whitespace stripped.
    text: str
    # Heading depth assigned by DocxParser._get_paragraph_level;
    # 0 means ordinary body text (the default).
    level: int = 0
    # True when the text starts with a recognised list marker.
    is_list: bool = False
    # The matched marker itself (e.g. "1." or "一、"); None when not a list item.
    list_number: Optional[str] = None


@dataclass
class DocxTable:
    """One table extracted from a Word document.

    Instances are created by ``DocxParser._extract_tables``.
    """
    # Cell texts, one inner list per table row, each cell stripped.
    rows: List[List[str]]
    # Markdown rendering of ``rows``; the first row is used as the header.
    markdown: str = ""


@dataclass
class DocxDocumentContent:
    """Everything ``DocxParser.parse`` extracts from a single Word file."""
    # Path that was passed to the parser.
    file_path: str
    # Non-empty paragraphs in document order.
    paragraphs: List[DocxParagraph]
    # Tables in document order.
    tables: List[DocxTable]
    # Core document properties (title, author, subject, keywords, created,
    # modified) as strings; left empty when they could not be read.
    metadata: Dict[str, str] = field(default_factory=dict)
    # Markdown rendering of the whole document.
    markdown_text: str = ""


class DocxParser:
    """Provide the Docx Parser parser."""

    def __init__(self) -> None:
        """Create a parser with no document loaded yet."""
        # Assigned by parse(); holds the python-docx Document being read.
        self.document = None

    def parse(self, file_path: str) -> DocxDocumentContent:
        """Load a Word file and return its structured content.

        Args:
            file_path: Path of the document, handed straight to ``Document``.

        Returns:
            A ``DocxDocumentContent`` carrying metadata, paragraphs, tables
            and the Markdown rendering built from them.

        Raises:
            Exception: Any error raised while loading or extracting is logged
                and then re-raised unchanged.
        """
        logger.info(f"开始解析Word文档: {file_path}")

        try:
            self.document = Document(file_path)

            # Extraction order: metadata, paragraphs, tables. The Markdown
            # rendering depends on all three, so it is produced last.
            extracted_metadata = self._extract_metadata()
            extracted_paragraphs = self._extract_paragraphs()
            extracted_tables = self._extract_tables()

            doc_content = DocxDocumentContent(
                file_path=file_path,
                paragraphs=extracted_paragraphs,
                tables=extracted_tables,
                metadata=extracted_metadata,
            )
            doc_content.markdown_text = self._generate_markdown(doc_content)
        except Exception as e:
            logger.error(f"Word文档解析失败: {e}")
            raise
        else:
            logger.success(f"Word文档解析完成，共{len(doc_content.paragraphs)}个段落")
            return doc_content

    def _extract_metadata(self) -> Dict[str, str]:
        """Handle extract metadata for this module for the Docx Parser instance."""
        metadata = {}
        try:
            core_props = self.document.core_properties
            metadata = {
                "title": core_props.title or "",
                "author": core_props.author or "",
                "subject": core_props.subject or "",
                "keywords": core_props.keywords or "",
                "created": str(core_props.created) if core_props.created else "",
                "modified": str(core_props.modified) if core_props.modified else "",
            }
        except Exception as e:
            logger.warning(f"提取元数据失败: {e}")
        return metadata

    def _extract_paragraphs(self) -> List[DocxParagraph]:
        """Collect every non-empty paragraph of the loaded document.

        Returns:
            ``DocxParagraph`` objects in document order. Paragraphs whose text
            is empty after stripping are skipped. Each kept paragraph carries
            its heading level and list-marker information.
        """
        collected = []

        for para in self.document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                heading_level = self._get_paragraph_level(para)
                list_flag, marker = self._detect_list_item(para)
                collected.append(
                    DocxParagraph(
                        text=stripped,
                        level=heading_level,
                        is_list=list_flag,
                        list_number=marker,
                    )
                )

        return collected

    def _get_paragraph_level(self, para) -> int:
        """Handle get paragraph level for this module for the Docx Parser instance."""
        # Keep service responsibilities explicit so downstream behavior stays predictable.
        style_name = para.style.name if para.style else ""

        if "Heading" in style_name or "标题" in style_name:
            # Keep service responsibilities explicit so downstream behavior stays predictable.
            match = re.search(r'Heading\s*(\d)|标题\s*(\d)', style_name)
            if match:
                level = int(match.group(1) or match.group(2))
                return level

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if para.paragraph_format:
            # Keep service responsibilities explicit so downstream behavior stays predictable.
            pass

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        text = para.text.strip()

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if re.match(r'^第[一二三四五六七八九十百]+章\s', text):
            return 2
        # Keep service responsibilities explicit so downstream behavior stays predictable.
        elif re.match(r'^第[一二三四五六七八九十百]+节\s', text):
            return 3
        # Keep service responsibilities explicit so downstream behavior stays predictable.
        elif re.match(r'^第[一二三四五六七八九十百]+条\s', text):
            return 4

        return 0  # Keep service responsibilities explicit so downstream behavior stays predictable.

    def _detect_list_item(self, para) -> tuple[bool, Optional[str]]:
        """Handle detect list item for this module for the Docx Parser instance."""
        text = para.text.strip()

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if re.match(r'^[\d]+[.、)\]]\s', text):
            match = re.match(r'^([\d]+[.、)\]])\s', text)
            return True, match.group(1) if match else None

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if re.match(r'^[一二三四五六七八九十]+[、.)]\s', text):
            match = re.match(r'^([一二三四五六七八九十]+[、.)])\s', text)
            return True, match.group(1) if match else None

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if para.paragraph_format and hasattr(para.paragraph_format, 'left_indent'):
            # Keep service responsibilities explicit so downstream behavior stays predictable.
            pass

        return False, None

    def _extract_tables(self) -> List[DocxTable]:
        """Collect every table of the loaded document.

        Returns:
            ``DocxTable`` objects in document order. Each one holds the
            stripped cell texts row by row together with the Markdown
            rendering produced by ``_table_to_markdown``.
        """
        extracted = []

        for table in self.document.tables:
            grid = [
                [cell.text.strip() for cell in row.cells]
                for row in table.rows
            ]
            rendered = self._table_to_markdown(grid)
            extracted.append(DocxTable(rows=grid, markdown=rendered))

        return extracted

    def _table_to_markdown(self, rows: List[List[str]]) -> str:
        """Handle table to markdown for this module for the Docx Parser instance."""
        if not rows or len(rows) < 1:
            return ""

        lines = []

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if len(rows) >= 1:
            header = rows[0]
            lines.append("| " + " | ".join(cell for cell in header) + " |")
            lines.append("| " + " | ".join("---" for _ in header) + " |")

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        for row in rows[1:]:
            lines.append("| " + " | ".join(cell for cell in row) + " |")

        return "\n".join(lines)

    def _generate_markdown(self, doc_content: DocxDocumentContent) -> str:
        """Handle generate markdown for this module for the Docx Parser instance."""
        lines = []

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        title = doc_content.metadata.get("title", "")
        if title:
            lines.append(f"# {title}\n")
        else:
            # Keep service responsibilities explicit so downstream behavior stays predictable.
            for para in doc_content.paragraphs[:5]:
                if para.level == 1:
                    lines.append(f"# {para.text}\n")
                    break
            else:
                lines.append(f"# {doc_content.file_path}\n")

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        lines.append("\n## 文档信息\n")
        for key, value in doc_content.metadata.items():
            if value:
                lines.append(f"- **{key}**: {value}")

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        lines.append("\n## 正文\n")

        table_index = 0
        for para in doc_content.paragraphs:
            if para.level > 0:
                # Keep service responsibilities explicit so downstream behavior stays predictable.
                prefix = "#" * para.level
                lines.append(f"\n{prefix} {para.text}\n")
            elif para.is_list:
                # Keep service responsibilities explicit so downstream behavior stays predictable.
                lines.append(f"- {para.text}")
            else:
                # Keep service responsibilities explicit so downstream behavior stays predictable.
                lines.append(para.text)

        # Keep service responsibilities explicit so downstream behavior stays predictable.
        if doc_content.tables:
            lines.append("\n## 表格\n")
            for i, table in enumerate(doc_content.tables):
                lines.append(f"\n### 表格 {i + 1}\n")
                lines.append(table.markdown + "\n")

        return "\n".join(lines)

    def parse_to_markdown(self, file_path: str) -> str:
        """Parse the Word file at *file_path* and return only its Markdown text.

        Args:
            file_path: Path of the document, forwarded to ``parse``.

        Returns:
            The ``markdown_text`` of the parsed content.
        """
        return self.parse(file_path).markdown_text


def parse_docx(file_path: str) -> DocxDocumentContent:
    """Parse the Word file at *file_path* with a fresh ``DocxParser``.

    Args:
        file_path: Path of the document to parse.

    Returns:
        The structured content produced by ``DocxParser.parse``.
    """
    return DocxParser().parse(file_path)


def parse_docx_to_markdown(file_path: str) -> str:
    """Parse the Word file at *file_path* and return its Markdown rendering.

    Args:
        file_path: Path of the document to parse.

    Returns:
        The Markdown text produced by ``DocxParser.parse_to_markdown``.
    """
    return DocxParser().parse_to_markdown(file_path)