first commit

2026-04-28 11:29:33 +08:00
commit c2a398930d
44 changed files with 5723 additions and 0 deletions
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -0,0 +1,118 @@
+# tests/test_parser.py
+"""文档解析测试"""
+
+import pytest
+from loguru import logger
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+from src.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
+from src.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
+from src.services.parser.mineru_parser import MinerUParser, ParserOrchestrator
+
+
+class TestPDFParser:
+    """Tests for the PDF parser."""
+
+    def test_parser_initialization(self):
+        """A PDFParser can be constructed without arguments."""
+        parser = PDFParser()
+        assert parser is not None
+
+    def test_parse_sample_pdf(self):
+        """Parse sample.pdf next to this file; skip when it is missing."""
+        sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
+        if not os.path.exists(sample_pdf):
+            pytest.skip("sample.pdf fixture not found; parse test not run")
+
+        result = PDFParser().parse(sample_pdf)
+
+        # The parse result must report pages and non-empty markdown_text.
+        assert result.total_pages > 0
+        assert len(result.pages) > 0
+        assert len(result.markdown_text) > 0
+
+
+class TestDocxParser:
+    """Tests for the Word (.docx) parser."""
+
+    def test_parser_initialization(self):
+        """A DocxParser can be constructed without arguments."""
+        parser = DocxParser()
+        assert parser is not None
+
+    def test_parse_sample_docx(self):
+        """Parse sample.docx next to this file; skip when it is missing."""
+        sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
+        if not os.path.exists(sample_docx):
+            pytest.skip("sample.docx fixture not found; parse test not run")
+
+        result = DocxParser().parse(sample_docx)
+
+        assert len(result.paragraphs) > 0
+        assert len(result.markdown_text) > 0
+
+
+class TestChunker:
+    """Tests for the regulation text chunker."""
+
+    def test_chunker_initialization(self):
+        """A RegulationChunker can be built with chunk_size=512."""
+        from src.services.embedding.text_chunker import RegulationChunker
+
+        instance = RegulationChunker(chunk_size=512)
+        assert instance is not None
+
+    def test_chunk_sample_text(self):
+        """Chunk a short regulation-style text and inspect the chunks."""
+        from src.services.embedding.text_chunker import RegulationChunker
+
+        regulation_md = """
+# 测试法规文档
+
+第一章 总则
+
+第一条 为规范某项行为，制定本规定。
+
+第二条 本规定适用于相关主体。
+
+第二章 具体要求
+
+第三条 相关主体应当遵守以下要求：
+
+（一）建立管理制度；
+（二）配备专业人员；
+（三）定期进行检查。
+"""
+
+        # Build the chunker inline and split the sample in one step.
+        pieces = RegulationChunker(chunk_size=256).chunk_document(
+            regulation_md,
+            doc_id="test",
+            doc_name="测试法规",
+        )
+
+        assert len(pieces) > 0
+
+        # At least one chunk must carry a truthy metadata.section_number.
+        assert any(piece.metadata.section_number for piece in pieces)
+
+
+class TestFullPipeline:
+    """Tests for the end-to-end document processing pipeline."""
+
+    def test_pipeline_without_files(self):
+        """A DocumentProcessor can be created and closed with no input files."""
+        from src.services.document_processor import DocumentProcessor
+        processor = DocumentProcessor()
+        try:
+            assert processor is not None
+        finally:  # close() must run even if the assertion above fails
+            processor.close()
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-v"]))  # propagate pytest's exit code