Files
AIRegulation-DocAnalysis/tests/test_parser.py
2026-04-28 11:29:33 +08:00

118 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# tests/test_parser.py
"""文档解析测试"""
import pytest
from loguru import logger
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from src.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
from src.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
from src.services.parser.mineru_parser import MinerUParser, ParserOrchestrator
class TestPDFParser:
    """Tests for the PDF parser."""

    def test_parser_initialization(self):
        """A PDFParser can be constructed with no arguments."""
        parser = PDFParser()
        assert parser is not None

    def test_parse_sample_pdf(self):
        """Parse ``sample.pdf`` located next to this file, if it exists.

        When the fixture is absent the test is reported as skipped instead
        of silently passing, so the report shows that the parsing path was
        not exercised.
        """
        sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
        if not os.path.exists(sample_pdf):
            pytest.skip(f"sample fixture not found: {sample_pdf}")

        parser = PDFParser()
        result = parser.parse(sample_pdf)
        # The parsed result must expose at least one page and some markdown.
        assert result.total_pages > 0
        assert len(result.pages) > 0
        assert len(result.markdown_text) > 0
class TestDocxParser:
    """Tests for the Word (DOCX) parser."""

    def test_parser_initialization(self):
        """A DocxParser can be constructed with no arguments."""
        parser = DocxParser()
        assert parser is not None

    def test_parse_sample_docx(self):
        """Parse ``sample.docx`` located next to this file, if it exists.

        When the fixture is absent the test is reported as skipped instead
        of silently passing, so the report shows that the parsing path was
        not exercised.
        """
        sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
        if not os.path.exists(sample_docx):
            pytest.skip(f"sample fixture not found: {sample_docx}")

        parser = DocxParser()
        result = parser.parse(sample_docx)
        # The parsed result must expose at least one paragraph and some markdown.
        assert len(result.paragraphs) > 0
        assert len(result.markdown_text) > 0
class TestChunker:
    """Tests for the regulation text chunker."""

    def test_chunker_initialization(self):
        """Constructing a RegulationChunker with ``chunk_size=512`` yields an object."""
        from src.services.embedding.text_chunker import RegulationChunker

        assert RegulationChunker(chunk_size=512) is not None

    def test_chunk_sample_text(self):
        """Chunk a small sample regulation and check section metadata is present."""
        from src.services.embedding.text_chunker import RegulationChunker

        # The literal below is runtime input for the chunker; its lines are
        # kept at column 0 so the text handed to the chunker is unchanged.
        regulation_markdown = """
# 测试法规文档
第一章 总则
第一条 为规范某项行为,制定本规定。
第二条 本规定适用于相关主体。
第二章 具体要求
第三条 相关主体应当遵守以下要求:
(一)建立管理制度;
(二)配备专业人员;
(三)定期进行检查。
"""
        pieces = RegulationChunker(chunk_size=256).chunk_document(
            regulation_markdown,
            doc_id="test",
            doc_name="测试法规",
        )
        assert len(pieces) > 0
        # At least one chunk must carry a section number in its metadata.
        assert any(piece.metadata.section_number for piece in pieces)
class TestFullPipeline:
    """Tests for the full document-processing pipeline."""

    def test_pipeline_without_files(self):
        """A DocumentProcessor can be created and closed without any input file."""
        from src.services.document_processor import DocumentProcessor

        processor = DocumentProcessor()
        try:
            assert processor is not None
        finally:
            # Always release the processor, even if the assertion above fails.
            processor.close()
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly ends
    # with a non-zero status when any test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))