# AIRegulation-DocAnalysis/tests/test_parser.py

# tests/test_parser.py
"""Document parsing tests."""

import pytest
from loguru import logger
import sys
import os

# Resolve the repository root from this file's absolute location so the path
# stays valid even when ``__file__`` is relative (e.g. running this file
# directly on Python < 3.9 from inside ``tests/``, where the un-normalized
# form collapses to an empty string).
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put ``<root>/backend`` first on the import path; the ``app`` package used by
# these tests is expected to be importable from there.
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))

from app.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
from app.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
from app.services.parser.mineru_parser import MinerUParser, ParserOrchestrator


class TestPDFParser:
    """Tests for the PDF parser."""

    def test_parser_initialization(self):
        """A ``PDFParser`` can be constructed with no arguments."""
        parser = PDFParser()
        assert parser is not None

    def test_parse_sample_pdf(self):
        """Parse ``sample.pdf`` located next to this file, if it exists.

        The fixture is optional. When it is missing the test is reported as
        skipped instead of passing without having executed any assertion.
        """
        sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
        if not os.path.exists(sample_pdf):
            pytest.skip(f"optional fixture not found: {sample_pdf}")

        parser = PDFParser()
        result = parser.parse(sample_pdf)

        # The parse result must report pages and non-empty markdown output.
        assert result.total_pages > 0
        assert len(result.pages) > 0
        assert len(result.markdown_text) > 0


class TestDocxParser:
    """Tests for the Word (DOCX) parser."""

    def test_parser_initialization(self):
        """A ``DocxParser`` can be constructed with no arguments."""
        parser = DocxParser()
        assert parser is not None

    def test_parse_sample_docx(self):
        """Parse ``sample.docx`` located next to this file, if it exists.

        The fixture is optional. When it is missing the test is reported as
        skipped instead of passing without having executed any assertion.
        """
        sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
        if not os.path.exists(sample_docx):
            pytest.skip(f"optional fixture not found: {sample_docx}")

        parser = DocxParser()
        result = parser.parse(sample_docx)

        # The parse result must contain paragraphs and non-empty markdown.
        assert len(result.paragraphs) > 0
        assert len(result.markdown_text) > 0


class TestChunker:
    """Tests for the regulation text chunker."""

    def test_chunker_initialization(self):
        """A ``RegulationChunker`` can be built with an explicit chunk size."""
        from app.services.embedding.text_chunker import RegulationChunker

        instance = RegulationChunker(chunk_size=512)
        assert instance is not None

    def test_chunk_sample_text(self):
        """Chunk a short sample regulation and check the produced chunks.

        Verifies that at least one chunk is returned and that at least one of
        them has a truthy ``metadata.section_number``.
        """
        from app.services.embedding.text_chunker import RegulationChunker

        regulation_md = """
# 测试法规文档

第一章 总则

第一条 为规范某项行为，制定本规定。

第二条 本规定适用于相关主体。

第二章 具体要求

第三条 相关主体应当遵守以下要求：

（一）建立管理制度；
（二）配备专业人员；
（三）定期进行检查。
"""

        splitter = RegulationChunker(chunk_size=256)
        pieces = splitter.chunk_document(
            regulation_md,
            doc_id="test",
            doc_name="测试法规",
        )

        assert len(pieces) > 0

        # Scan chunks in order and stop at the first one carrying section info.
        found_section = False
        for piece in pieces:
            if piece.metadata.section_number:
                found_section = True
                break
        assert found_section


class TestFullPipeline:
    """Tests for the full document-processing pipeline."""

    def test_pipeline_without_files(self):
        """Construct a ``DocumentProcessor`` without input files, then close it."""
        from app.services.document_processor import DocumentProcessor

        processor = DocumentProcessor()
        try:
            assert processor is not None
        finally:
            # Always release the processor, even when an assertion fails.
            processor.close()


if __name__ == "__main__":
    # Propagate pytest's exit status so a failing run exits non-zero when this
    # file is executed directly as a script.
    sys.exit(pytest.main([__file__, "-v"]))