Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,119 +1,236 @@
|
||||
# tests/test_parser.py
|
||||
"""文档解析测试"""
|
||||
"""API contract checks for the migrated backend architecture."""
|
||||
|
||||
import pytest
|
||||
from loguru import logger
|
||||
import sys
|
||||
import os
|
||||
from __future__ import annotations
|
||||
|
||||
# Resolve the project root from this file's absolute location so the result
# does not depend on the working directory when ``__file__`` is relative.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the ``backend`` directory first on the import path; presumably this is
# where the ``app`` package imported below lives -- confirm against the repo.
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
|
||||
from dataclasses import dataclass
|
||||
|
||||
from app.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
|
||||
from app.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
|
||||
from app.services.parser.mineru_parser import MinerUParser, ParserOrchestrator
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from app.api.main import app
|
||||
from app.application.documents import DocumentProcessResult
|
||||
from app.domain.conversation.models import AnswerResult, AnswerSource, ConversationSession
|
||||
from app.domain.documents import Document, DocumentStatus
|
||||
from app.domain.retrieval import RetrievedChunk
|
||||
|
||||
|
||||
class TestPDFParser:
    """Tests for the PDF parser."""

    def test_parser_initialization(self):
        """The PDF parser can be constructed with its default arguments."""
        parser = PDFParser()
        assert parser is not None

    def test_parse_sample_pdf(self):
        """Parse ``sample.pdf`` located next to this module, when it exists."""
        sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
        if not os.path.exists(sample_pdf):
            # Report the missing fixture as a skip instead of a silent pass,
            # so a run without the sample file is not mistaken for coverage.
            pytest.skip("sample.pdf fixture not found next to this test module")

        parser = PDFParser()
        result = parser.parse(sample_pdf)

        assert result.total_pages > 0
        assert len(result.pages) > 0
        assert len(result.markdown_text) > 0
|
||||
@dataclass
class FakeMessage:
    """Minimal message record used to populate fake conversation sessions."""

    # Speaker label; this file uses "user" and "assistant".
    role: str
    # Message text.
    content: str
|
||||
|
||||
|
||||
class TestDocxParser:
    """Tests for the Word (DOCX) parser."""

    def test_parser_initialization(self):
        """The DOCX parser can be constructed with its default arguments."""
        parser = DocxParser()
        assert parser is not None

    def test_parse_sample_docx(self):
        """Parse ``sample.docx`` located next to this module, when it exists."""
        sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
        if not os.path.exists(sample_docx):
            # Report the missing fixture as a skip instead of a silent pass.
            pytest.skip("sample.docx fixture not found next to this test module")

        parser = DocxParser()
        result = parser.parse(sample_docx)

        assert len(result.paragraphs) > 0
        assert len(result.markdown_text) > 0
|
||||
|
||||
|
||||
class TestChunker:
    """Tests for the regulation text chunker."""

    def test_chunker_initialization(self):
        """The chunker can be constructed with an explicit chunk size."""
        from app.services.embedding.text_chunker import RegulationChunker

        chunker = RegulationChunker(chunk_size=512)
        assert chunker is not None

    def test_chunk_sample_text(self):
        """Chunk a small regulation-style text and check section metadata."""
        from app.services.embedding.text_chunker import RegulationChunker

        # Runtime fixture text: kept verbatim, including blank lines.
        sample_text = """
# 测试法规文档

第一章 总则

第一条 为规范某项行为,制定本规定。

第二条 本规定适用于相关主体。

第二章 具体要求

第三条 相关主体应当遵守以下要求:

(一)建立管理制度;
(二)配备专业人员;
(三)定期进行检查。
"""

        chunker = RegulationChunker(chunk_size=256)
        chunks = chunker.chunk_document(
            sample_text,
            doc_id="test",
            doc_name="测试法规",
        )

        assert len(chunks) > 0

        # At least one chunk must carry a section number in its metadata.
        has_section = any(c.metadata.section_number for c in chunks)
        assert has_section


class FakeDocumentCommandService:
    """Stand-in for the document command service that always reports success."""

    def upload_and_process(self, **kwargs) -> DocumentProcessResult:
        """Return a fixed "indexed" result for any upload.

        Only ``doc_name`` is read from ``kwargs``; when it is missing or
        falsy the name falls back to ``"test.pdf"``. All other keyword
        arguments are accepted and ignored.
        """
        return DocumentProcessResult(
            doc_id="doc-api-1",
            doc_name=kwargs.get("doc_name") or "test.pdf",
            status="indexed",
            message="处理成功",
            num_chunks=2,
            summary="",
            summary_latency_ms=0,
        )
|
||||
class FakeDocumentQueryService:
    """Stand-in for the document query service that knows exactly one document."""

    # Identifier of the only document this fake serves.
    _KNOWN_DOC_ID = "doc-api-1"

    def _build_document(self) -> Document:
        """Build the single indexed fixture document shared by all queries."""
        return Document(
            doc_id=self._KNOWN_DOC_ID,
            doc_name="测试法规",
            file_name="test.pdf",
            object_name="doc-api-1/test.pdf",
            content_type="application/pdf",
            size_bytes=12,
            status=DocumentStatus.INDEXED,
            chunk_count=2,
        )

    def get(self, doc_id: str) -> Document | None:
        """Return the fixture document for the known id, otherwise ``None``."""
        if doc_id != self._KNOWN_DOC_ID:
            return None
        return self._build_document()

    def list_documents(self, limit: int | None = None) -> list[Document]:
        """Return the fixture document list, sliced to ``limit`` when given."""
        documents = [self._build_document()]
        return documents[:limit] if limit is not None else documents

    def download(self, doc_id: str) -> tuple[Document, bytes]:
        """Return the document and a fixed byte payload.

        Raises:
            FileNotFoundError: if ``doc_id`` is not the known document id.
        """
        document = self.get(doc_id)
        if document is None:
            raise FileNotFoundError(doc_id)
        return document, b"pdf-bytes"
|
||||
|
||||
|
||||
class TestFullPipeline:
    """End-to-end pipeline tests."""

    def test_pipeline_without_files(self):
        """The document processor can be constructed and closed without files."""
        from app.services.document_processor import DocumentProcessor

        processor = DocumentProcessor()
        try:
            assert processor is not None
        finally:
            # Always release the processor, even if the assertion fails.
            processor.close()
|
||||
class FakeRetrievalService:
    """Stand-in for the retrieval service that returns one fixed chunk."""

    def retrieve(self, *, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Return a single chunk whose content embeds ``query``.

        ``top_k`` is accepted but not used. ``filters`` is echoed back in the
        chunk metadata under the ``"filters"`` key (empty string when absent).
        """
        chunk = RetrievedChunk(
            chunk_id="chunk-1",
            doc_id="doc-api-1",
            doc_name="测试法规",
            content=f"关于 {query} 的法规内容",
            score=0.92,
            section_title="第一章",
            page_number=1,
            metadata={"filters": filters or ""},
        )
        return [chunk]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly returns a
    # non-zero status when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
class FakeConversationStore:
    """Stand-in for the conversation store holding one pre-seeded session."""

    # Identifier of the only session this fake knows about.
    _SESSION_ID = "sess-1"

    def __init__(self) -> None:
        """Create the single session with one user and one assistant message."""
        self.session = ConversationSession(
            session_id=self._SESSION_ID,
            created_at=1,
            updated_at=1,
            messages=[
                FakeMessage(role="user", content="历史问题"),
                FakeMessage(role="assistant", content="历史回答"),
            ],
        )

    def get_session(self, session_id: str) -> ConversationSession | None:
        """Return the seeded session for the known id, otherwise ``None``."""
        if session_id == self._SESSION_ID:
            return self.session
        return None

    def delete_session(self, session_id: str) -> bool:
        """Report whether ``session_id`` is the known id; nothing is removed."""
        return session_id == self._SESSION_ID

    def list_sessions(self) -> list[dict]:
        """Return a one-element summary list describing the seeded session."""
        return [
            {
                "session_id": self._SESSION_ID,
                "message_count": len(self.session.messages),
                "created_at": 1,
                "updated_at": 1,
            }
        ]
|
||||
|
||||
|
||||
class FakeAgentConversationService:
    """Stand-in for the agent conversation service with canned answers."""

    # Model name reported when the caller does not supply one.
    _DEFAULT_MODEL = "qwen3.5-flash"

    @classmethod
    def _resolve_model(cls, kwargs: dict) -> str:
        """Return the caller's ``model`` value, or the default when missing/falsy."""
        return kwargs.get("model") or cls._DEFAULT_MODEL

    def ask(self, **kwargs):
        """Return ``(None, AnswerResult)`` with one fixed source and no session id."""
        result = AnswerResult(
            answer="这是基于法规上下文的回答",
            sources=[
                AnswerSource(
                    doc_id="doc-api-1",
                    doc_name="测试法规",
                    chunk_id="chunk-1",
                    section_title="第一章",
                    page_number=1,
                    score=0.92,
                    content="法规原文",
                    metadata={"section_title": "第一章"},
                )
            ],
            model=self._resolve_model(kwargs),
            latency_ms=11,
            retrieved_count=1,
            context_tokens=128,
            truncated=False,
            error=None,
        )
        return None, result

    def chat(self, **kwargs):
        """Return ``("sess-1", AnswerResult)`` with a fixed answer and no sources."""
        result = AnswerResult(
            answer="会话回答",
            sources=[],
            model=self._resolve_model(kwargs),
            latency_ms=12,
            retrieved_count=1,
            context_tokens=64,
            truncated=False,
            error=None,
        )
        return "sess-1", result

    def stream_chat(self, **kwargs):
        """Return ``("sess-1", iterator)`` yielding status, content and done events."""
        events = [
            {"event": "status", "data": "正在处理"},
            {"event": "content", "data": "流式回答"},
            {"event": "done", "data": {"retrieved_count": 1}},
        ]
        return "sess-1", iter(events)
|
||||
|
||||
|
||||
def test_documents_upload_contract_preserved(monkeypatch):
    """POST to the upload endpoint returns the fields produced by the fake service."""
    from app.api.routes import documents

    # Replace the service factory on the routes module with the fake.
    monkeypatch.setattr(documents, "get_document_command_service", lambda: FakeDocumentCommandService())

    client = TestClient(app)
    response = client.post(
        "/api/v1/documents/upload",
        files={"file": ("test.pdf", b"dummy-pdf", "application/pdf")},
        data={"doc_name": "测试法规", "regulation_type": "车辆安全", "version": "2026"},
    )

    assert response.status_code == 200
    payload = response.json()
    assert payload["doc_id"] == "doc-api-1"
    assert payload["doc_name"] == "测试法规"
    assert payload["status"] == "indexed"
    assert payload["num_chunks"] == 2
|
||||
|
||||
|
||||
def test_documents_query_contract_preserved(monkeypatch):
    """The status, list and download endpoints answer from the fake query service."""
    from app.api.routes import documents

    # Replace the service factory on the routes module with the fake.
    monkeypatch.setattr(documents, "get_document_query_service", lambda: FakeDocumentQueryService())

    client = TestClient(app)

    status_response = client.get("/api/v1/documents/status/doc-api-1")
    assert status_response.status_code == 200
    assert status_response.json()["status"] == "indexed"

    list_response = client.get("/api/v1/documents/list")
    assert list_response.status_code == 200
    assert list_response.json()["total"] == 1

    download_response = client.get("/api/v1/documents/download/doc-api-1")
    assert download_response.status_code == 200
    assert download_response.content == b"pdf-bytes"
|
||||
|
||||
|
||||
def test_knowledge_retrieval_contract_preserved(monkeypatch):
    """POST to the retrieval endpoint echoes the query and exposes chunk metadata."""
    from app.api.routes import knowledge

    # Replace the service factory on the routes module with the fake.
    monkeypatch.setattr(knowledge, "get_retrieval_service", lambda: FakeRetrievalService())

    client = TestClient(app)
    response = client.post(
        "/api/v1/knowledge/retrieval",
        json={"query": "机动车安全", "top_k": 3, "filters": 'doc_id == "doc-api-1"'},
    )

    assert response.status_code == 200
    payload = response.json()
    assert payload["query"] == "机动车安全"
    assert payload["total"] == 1

    first_metadata = payload["results"][0]["metadata"]
    assert first_metadata["doc_id"] == "doc-api-1"
    assert first_metadata["section_title"] == "第一章"
|
||||
|
||||
|
||||
def test_agent_ask_and_stream_contract_preserved(monkeypatch):
    """The ask endpoint returns the canned answer; the stream endpoint emits SSE events."""
    from app.api.routes import agent

    # Replace both factories on the routes module with the fakes.
    store = FakeConversationStore()
    monkeypatch.setattr(agent, "get_agent_conversation_service", lambda: FakeAgentConversationService())
    monkeypatch.setattr(agent, "get_conversation_store", lambda: store)

    client = TestClient(app)

    ask_response = client.post("/api/v1/agent/ask", json={"query": "这个法规要求什么?"})
    assert ask_response.status_code == 200
    ask_payload = ask_response.json()
    assert ask_payload["answer"] == "这是基于法规上下文的回答"
    assert ask_payload["retrieved_count"] == 1
    assert ask_payload["sources"][0]["doc_id"] == "doc-api-1"

    stream_response = client.get("/api/v1/agent/chat/stream", params={"query": "继续说明"})
    assert stream_response.status_code == 200
    assert stream_response.headers["content-type"].startswith("text/event-stream")

    stream_text = stream_response.text
    assert "event: session" in stream_text
    assert "event: content" in stream_text
|
||||
|
||||
Reference in New Issue
Block a user