2026-04-28 11:29:33 +08:00
|
|
|
|
# tests/test_parser.py
|
|
|
|
|
|
"""文档解析测试"""
|
|
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import os
|
|
|
|
|
|
|
2026-05-14 18:09:15 +08:00
|
|
|
|
# Repository root: two directory levels above this file. The path is made
# absolute first so the result does not depend on the current working
# directory when __file__ happens to be relative (e.g. some script runs).
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put <root>/backend first on sys.path so the `app.*` imports that follow
# resolve against the backend package.
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
|
2026-04-28 11:29:33 +08:00
|
|
|
|
|
2026-05-14 18:09:15 +08:00
|
|
|
|
from app.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
|
|
|
|
|
|
from app.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
|
|
|
|
|
|
from app.services.parser.mineru_parser import MinerUParser, ParserOrchestrator
|
2026-04-28 11:29:33 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestPDFParser:
    """Tests for the PDF parser."""

    def test_parser_initialization(self):
        """A PDFParser can be constructed without arguments."""
        parser = PDFParser()
        assert parser is not None

    def test_parse_sample_pdf(self):
        """Parse the optional ``sample.pdf`` fixture that sits next to this file.

        The fixture is optional. When it is missing the test is reported as
        skipped rather than passing silently without checking anything.
        """
        sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
        if not os.path.exists(sample_pdf):
            pytest.skip("sample.pdf fixture not found next to this test file")

        parser = PDFParser()
        result = parser.parse(sample_pdf)

        # A successful parse must yield pages and non-empty markdown output.
        assert result.total_pages > 0
        assert len(result.pages) > 0
        assert len(result.markdown_text) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestDocxParser:
    """Tests for the Word (DOCX) parser."""

    def test_parser_initialization(self):
        """A DocxParser can be constructed without arguments."""
        parser = DocxParser()
        assert parser is not None

    def test_parse_sample_docx(self):
        """Parse the optional ``sample.docx`` fixture that sits next to this file.

        The fixture is optional. When it is missing the test is reported as
        skipped rather than passing silently without checking anything.
        """
        sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
        if not os.path.exists(sample_docx):
            pytest.skip("sample.docx fixture not found next to this test file")

        parser = DocxParser()
        result = parser.parse(sample_docx)

        # A successful parse must yield paragraphs and non-empty markdown output.
        assert len(result.paragraphs) > 0
        assert len(result.markdown_text) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestChunker:
    """Tests for the regulation text chunker."""

    def test_chunker_initialization(self):
        """A RegulationChunker can be built with an explicit chunk size."""
        from app.services.embedding.text_chunker import RegulationChunker

        instance = RegulationChunker(chunk_size=512)
        assert instance is not None

    def test_chunk_sample_text(self):
        """Chunk a small regulation-style sample and check section metadata."""
        from app.services.embedding.text_chunker import RegulationChunker

        sample_text = """
# 测试法规文档

第一章 总则

第一条 为规范某项行为,制定本规定。

第二条 本规定适用于相关主体。

第二章 具体要求

第三条 相关主体应当遵守以下要求:

(一)建立管理制度;
(二)配备专业人员;
(三)定期进行检查。
"""

        splitter = RegulationChunker(chunk_size=256)
        pieces = splitter.chunk_document(sample_text, doc_id="test", doc_name="测试法规")

        assert len(pieces) > 0

        # At least one chunk must carry a section number in its metadata.
        assert any(piece.metadata.section_number for piece in pieces)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestFullPipeline:
    """End-to-end pipeline tests."""

    def test_pipeline_without_files(self):
        """Construct a DocumentProcessor with no input files, then close it."""
        from app.services.document_processor import DocumentProcessor

        pipeline = DocumentProcessor()
        assert pipeline is not None

        pipeline.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly exits
    # non-zero when tests fail, instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|