Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -1,119 +1,236 @@
# tests/test_parser.py
"""文档解析测试"""
"""API contract checks for the migrated backend architecture."""
import pytest
from loguru import logger
import sys
import os
from __future__ import annotations
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
from dataclasses import dataclass
from app.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
from app.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
from app.services.parser.mineru_parser import MinerUParser, ParserOrchestrator
from fastapi.testclient import TestClient
from app.api.main import app
from app.application.documents import DocumentProcessResult
from app.domain.conversation.models import AnswerResult, AnswerSource, ConversationSession
from app.domain.documents import Document, DocumentStatus
from app.domain.retrieval import RetrievedChunk
class TestPDFParser:
    """Tests for the PDF parser."""

    def test_parser_initialization(self):
        """The PDF parser can be constructed with default arguments."""
        parser = PDFParser()
        assert parser is not None

    def test_parse_sample_pdf(self):
        """Parse the optional ``sample.pdf`` fixture located next to this file.

        The test is reported as skipped (instead of silently passing) when
        the fixture file does not exist.
        """
        sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
        if not os.path.exists(sample_pdf):
            pytest.skip("sample.pdf fixture not available")

        parser = PDFParser()
        result = parser.parse(sample_pdf)
        assert result.total_pages > 0
        assert len(result.pages) > 0
        assert len(result.markdown_text) > 0
@dataclass
class FakeMessage:
    """Minimal chat message used to populate fake conversation sessions."""

    # Speaker of the message; this file uses "user" and "assistant".
    role: str
    # Text body of the message.
    content: str
class TestDocxParser:
    """Tests for the Word (DOCX) parser."""

    def test_parser_initialization(self):
        """The DOCX parser can be constructed with default arguments."""
        parser = DocxParser()
        assert parser is not None

    def test_parse_sample_docx(self):
        """Parse the optional ``sample.docx`` fixture located next to this file.

        The test is reported as skipped (instead of silently passing) when
        the fixture file does not exist.
        """
        sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
        if not os.path.exists(sample_docx):
            pytest.skip("sample.docx fixture not available")

        parser = DocxParser()
        result = parser.parse(sample_docx)
        assert len(result.paragraphs) > 0
        assert len(result.markdown_text) > 0
class TestChunker:
    """Tests for the regulation text chunker."""

    def test_chunker_initialization(self):
        """The chunker can be constructed with an explicit chunk size."""
        from app.services.embedding.text_chunker import RegulationChunker

        chunker = RegulationChunker(chunk_size=512)
        assert chunker is not None

    def test_chunk_sample_text(self):
        """Chunk a small regulation sample and check section info is kept."""
        from app.services.embedding.text_chunker import RegulationChunker

        sample_text = """
# 测试法规文档
第一章 总则
第一条 为规范某项行为,制定本规定。
第二条 本规定适用于相关主体。
第二章 具体要求
第三条 相关主体应当遵守以下要求:
(一)建立管理制度;
(二)配备专业人员;
(三)定期进行检查。
"""
        chunker = RegulationChunker(chunk_size=256)
        chunks = chunker.chunk_document(
            sample_text,
            doc_id="test",
            doc_name="测试法规",
        )
        assert len(chunks) > 0
        # At least one chunk must carry a section number in its metadata.
        has_section = any(c.metadata.section_number for c in chunks)
        assert has_section


class FakeDocumentCommandService:
    """Stand-in for the document command service used by the upload route test."""

    def upload_and_process(self, **kwargs) -> DocumentProcessResult:
        """Return a fixed "indexed" result for ``doc-api-1``.

        Only ``doc_name`` is read from *kwargs*; a missing or falsy value
        falls back to ``"test.pdf"``. All other keyword arguments are ignored.
        """
        return DocumentProcessResult(
            doc_id="doc-api-1",
            doc_name=kwargs.get("doc_name") or "test.pdf",
            status="indexed",
            message="处理成功",
            num_chunks=2,
            summary="",
            summary_latency_ms=0,
        )
class FakeDocumentQueryService:
    """In-memory stand-in for the document query service.

    Exactly one document is known, identified by ``doc-api-1``.
    """

    _KNOWN_DOC_ID = "doc-api-1"

    @staticmethod
    def _build_document(doc_id: str) -> Document:
        """Build the single indexed fixture document under *doc_id*."""
        return Document(
            doc_id=doc_id,
            doc_name="测试法规",
            file_name="test.pdf",
            object_name="doc-api-1/test.pdf",
            content_type="application/pdf",
            size_bytes=12,
            status=DocumentStatus.INDEXED,
            chunk_count=2,
        )

    def get(self, doc_id: str) -> Document | None:
        """Return the fixture document for the known id, otherwise ``None``."""
        if doc_id != self._KNOWN_DOC_ID:
            return None
        return self._build_document(doc_id)

    def list_documents(self, limit: int | None = None) -> list[Document]:
        """Return the one-element document list, sliced to *limit* when given."""
        documents = [self._build_document(self._KNOWN_DOC_ID)]
        return documents[:limit] if limit is not None else documents

    def download(self, doc_id: str) -> tuple[Document, bytes]:
        """Return ``(document, b"pdf-bytes")`` for the known id.

        Raises:
            FileNotFoundError: if *doc_id* is not the known document id.
        """
        document = self.get(doc_id)
        if document is None:
            raise FileNotFoundError(doc_id)
        return document, b"pdf-bytes"
class TestFullPipeline:
    """End-to-end pipeline tests."""

    def test_pipeline_without_files(self):
        """The document processor can be created and closed without any input files."""
        from app.services.document_processor import DocumentProcessor

        processor = DocumentProcessor()
        try:
            assert processor is not None
        finally:
            # Always release the processor, even if the assertion above fails.
            processor.close()
class FakeRetrievalService:
    """Retrieval stub that always yields a single chunk from ``doc-api-1``."""

    def retrieve(self, *, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Return one fixed chunk whose content embeds *query*.

        ``top_k`` is accepted but not used. ``filters`` is echoed back in the
        chunk metadata, as an empty string when it is ``None`` or empty.
        """
        echoed_filters = filters or ""
        only_chunk = RetrievedChunk(
            chunk_id="chunk-1",
            doc_id="doc-api-1",
            doc_name="测试法规",
            content=f"关于 {query} 的法规内容",
            score=0.92,
            section_title="第一章",
            page_number=1,
            metadata={"filters": echoed_filters},
        )
        return [only_chunk]
if __name__ == "__main__":
    # Propagate pytest's exit status so a failing run exits non-zero
    # when this file is executed directly as a script.
    raise SystemExit(pytest.main([__file__, "-v"]))
class FakeConversationStore:
    """Conversation store stub holding exactly one session, ``sess-1``."""

    # Single source of truth for the only session id this fake knows about.
    _SESSION_ID = "sess-1"

    def __init__(self) -> None:
        """Create the one fixture session with a user/assistant message pair."""
        self.session = ConversationSession(
            session_id=self._SESSION_ID,
            created_at=1,
            updated_at=1,
            messages=[
                FakeMessage(role="user", content="历史问题"),
                FakeMessage(role="assistant", content="历史回答"),
            ],
        )

    def get_session(self, session_id: str) -> ConversationSession | None:
        """Return the fixture session for the known id, otherwise ``None``."""
        if session_id == self._SESSION_ID:
            return self.session
        return None

    def delete_session(self, session_id: str) -> bool:
        """Report whether *session_id* is the known id; nothing is actually removed."""
        return session_id == self._SESSION_ID

    def list_sessions(self) -> list[dict]:
        """Return a one-element summary list describing the fixture session."""
        return [
            {
                "session_id": self._SESSION_ID,
                "message_count": len(self.session.messages),
                "created_at": 1,
                "updated_at": 1,
            }
        ]
class FakeAgentConversationService:
    """Agent conversation stub returning canned answers for ask/chat/stream."""

    # Model name reported when the caller passes no (or a falsy) ``model``.
    _DEFAULT_MODEL = "qwen3.5-flash"

    def ask(self, **kwargs):
        """Return ``(None, AnswerResult)`` with one canned source.

        Only ``model`` is read from *kwargs*; everything else is ignored.
        """
        result = AnswerResult(
            answer="这是基于法规上下文的回答",
            sources=[
                AnswerSource(
                    doc_id="doc-api-1",
                    doc_name="测试法规",
                    chunk_id="chunk-1",
                    section_title="第一章",
                    page_number=1,
                    score=0.92,
                    content="法规原文",
                    metadata={"section_title": "第一章"},
                )
            ],
            model=kwargs.get("model") or self._DEFAULT_MODEL,
            latency_ms=11,
            retrieved_count=1,
            context_tokens=128,
            truncated=False,
            error=None,
        )
        return None, result

    def chat(self, **kwargs):
        """Return ``("sess-1", AnswerResult)`` with no sources.

        Only ``model`` is read from *kwargs*; everything else is ignored.
        """
        result = AnswerResult(
            answer="会话回答",
            sources=[],
            model=kwargs.get("model") or self._DEFAULT_MODEL,
            latency_ms=12,
            retrieved_count=1,
            context_tokens=64,
            truncated=False,
            error=None,
        )
        return "sess-1", result

    def stream_chat(self, **kwargs):
        """Return ``("sess-1", iterator)`` over three canned events.

        The events are, in order: ``status``, ``content`` and ``done``.
        *kwargs* is ignored.
        """
        events = [
            {"event": "status", "data": "正在处理"},
            {"event": "content", "data": "流式回答"},
            {"event": "done", "data": {"retrieved_count": 1}},
        ]
        return "sess-1", iter(events)
def test_documents_upload_contract_preserved(monkeypatch):
    """Upload a dummy PDF and check the response's id, name, status and chunk count."""
    from app.api.routes import documents

    monkeypatch.setattr(
        documents,
        "get_document_command_service",
        lambda: FakeDocumentCommandService(),
    )
    client = TestClient(app)

    upload_files = {"file": ("test.pdf", b"dummy-pdf", "application/pdf")}
    form_fields = {"doc_name": "测试法规", "regulation_type": "车辆安全", "version": "2026"}
    response = client.post(
        "/api/v1/documents/upload",
        files=upload_files,
        data=form_fields,
    )

    assert response.status_code == 200
    body = response.json()
    assert body["doc_id"] == "doc-api-1"
    assert body["doc_name"] == "测试法规"
    assert body["status"] == "indexed"
    assert body["num_chunks"] == 2
def test_documents_query_contract_preserved(monkeypatch):
    """Exercise the status, list and download document endpoints with the fake query service patched in."""
    from app.api.routes import documents

    monkeypatch.setattr(
        documents,
        "get_document_query_service",
        lambda: FakeDocumentQueryService(),
    )
    client = TestClient(app)

    # Status of the single known document.
    status_reply = client.get("/api/v1/documents/status/doc-api-1")
    assert status_reply.status_code == 200
    assert status_reply.json()["status"] == "indexed"

    # Listing reports exactly one document.
    listing_reply = client.get("/api/v1/documents/list")
    assert listing_reply.status_code == 200
    assert listing_reply.json()["total"] == 1

    # Download returns the fake service's raw bytes.
    file_reply = client.get("/api/v1/documents/download/doc-api-1")
    assert file_reply.status_code == 200
    assert file_reply.content == b"pdf-bytes"
def test_knowledge_retrieval_contract_preserved(monkeypatch):
    """Post a retrieval query and check the echoed query, total and first result's metadata."""
    from app.api.routes import knowledge

    monkeypatch.setattr(
        knowledge,
        "get_retrieval_service",
        lambda: FakeRetrievalService(),
    )
    client = TestClient(app)

    request_body = {"query": "机动车安全", "top_k": 3, "filters": 'doc_id == "doc-api-1"'}
    response = client.post("/api/v1/knowledge/retrieval", json=request_body)

    assert response.status_code == 200
    body = response.json()
    assert body["query"] == "机动车安全"
    assert body["total"] == 1
    first_metadata = body["results"][0]["metadata"]
    assert first_metadata["doc_id"] == "doc-api-1"
    assert first_metadata["section_title"] == "第一章"
def test_agent_ask_and_stream_contract_preserved(monkeypatch):
    """Check the agent ask endpoint's JSON answer and the chat stream endpoint's SSE output."""
    from app.api.routes import agent

    store = FakeConversationStore()
    monkeypatch.setattr(
        agent,
        "get_agent_conversation_service",
        lambda: FakeAgentConversationService(),
    )
    monkeypatch.setattr(agent, "get_conversation_store", lambda: store)
    client = TestClient(app)

    # One-shot question/answer endpoint.
    ask_reply = client.post("/api/v1/agent/ask", json={"query": "这个法规要求什么?"})
    assert ask_reply.status_code == 200
    ask_body = ask_reply.json()
    assert ask_body["answer"] == "这是基于法规上下文的回答"
    assert ask_body["retrieved_count"] == 1
    assert ask_body["sources"][0]["doc_id"] == "doc-api-1"

    # Streaming chat endpoint: must answer as an event stream carrying
    # both a session event and a content event.
    stream_reply = client.get("/api/v1/agent/chat/stream", params={"query": "继续说明"})
    assert stream_reply.status_code == 200
    assert stream_reply.headers["content-type"].startswith("text/event-stream")
    streamed_text = stream_reply.text
    assert "event: session" in streamed_text
    assert "event: content" in streamed_text