# tests/test_parser.py
"""Tests for document parsing, regulation chunking and the processing pipeline.

The parser tests that need a real document look for ``sample.pdf`` /
``sample.docx`` next to this file and are skipped when the fixture is absent.
"""
import os
import sys

import pytest
from loguru import logger

# Directory holding this test module; optional fixtures are looked up here.
_TESTS_DIR = os.path.dirname(os.path.abspath(__file__))

# Put the parent of the tests directory on sys.path so that ``src`` is
# importable. ``abspath`` keeps this correct even when ``__file__`` is relative.
sys.path.insert(0, os.path.dirname(_TESTS_DIR))

from src.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown  # noqa: E402
from src.services.parser.docx_parser import DocxParser, parse_docx_to_markdown  # noqa: E402
from src.services.parser.mineru_parser import MinerUParser, ParserOrchestrator  # noqa: E402


def _require_sample(filename):
    """Return the path of a fixture located next to this file.

    Skips the calling test when the file does not exist, so a missing
    fixture is reported as "skipped" instead of a misleading "passed".
    """
    path = os.path.join(_TESTS_DIR, filename)
    if not os.path.exists(path):
        pytest.skip(f"sample fixture not found: {path}")
    return path


class TestPDFParser:
    """PDF parsing tests."""

    def test_parser_initialization(self):
        """A PDFParser can be constructed with default arguments."""
        parser = PDFParser()
        assert parser is not None

    def test_parse_sample_pdf(self):
        """Parse ``sample.pdf`` (when available) and check the result is non-empty."""
        sample_pdf = _require_sample("sample.pdf")

        parser = PDFParser()
        result = parser.parse(sample_pdf)

        assert result.total_pages > 0
        assert len(result.pages) > 0
        assert len(result.markdown_text) > 0


class TestDocxParser:
    """Word document parsing tests."""

    def test_parser_initialization(self):
        """A DocxParser can be constructed with default arguments."""
        parser = DocxParser()
        assert parser is not None

    def test_parse_sample_docx(self):
        """Parse ``sample.docx`` (when available) and check the result is non-empty."""
        sample_docx = _require_sample("sample.docx")

        parser = DocxParser()
        result = parser.parse(sample_docx)

        assert len(result.paragraphs) > 0
        assert len(result.markdown_text) > 0


class TestChunker:
    """Regulation chunker tests."""

    def test_chunker_initialization(self):
        """A RegulationChunker can be constructed with an explicit chunk size."""
        # Imported locally so a failure in the embedding package does not
        # prevent the parser tests above from being collected.
        from src.services.embedding.text_chunker import RegulationChunker

        chunker = RegulationChunker(chunk_size=512)
        assert chunker is not None

    def test_chunk_sample_text(self):
        """Chunk a small regulation-style text and expect section metadata."""
        from src.services.embedding.text_chunker import RegulationChunker

        # NOTE(review): the line breaks of this sample were reconstructed at the
        # heading / chapter / article / item markers -- confirm against the
        # intended fixture layout.
        sample_text = """
# 测试法规文档

第一章 总则

第一条 为规范某项行为,制定本规定。

第二条 本规定适用于相关主体。

第二章 具体要求

第三条 相关主体应当遵守以下要求:
(一)建立管理制度;
(二)配备专业人员;
(三)定期进行检查。
"""

        chunker = RegulationChunker(chunk_size=256)
        chunks = chunker.chunk_document(
            sample_text,
            doc_id="test",
            doc_name="测试法规",
        )

        assert len(chunks) > 0

        # At least one chunk must carry a section number in its metadata.
        has_section = any(c.metadata.section_number for c in chunks)
        assert has_section


class TestFullPipeline:
    """End-to-end pipeline tests."""

    def test_pipeline_without_files(self):
        """The DocumentProcessor can be created and closed without any input files."""
        from src.services.document_processor import DocumentProcessor

        processor = DocumentProcessor()
        try:
            assert processor is not None
        finally:
            # Always call close(), even if the assertion above fails.
            processor.close()


if __name__ == "__main__":
    pytest.main([__file__, "-v"])