Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -1,185 +1,197 @@
# tests/test_embedding.py
"""嵌入和分块测试"""
"""新架构下的文档编排与 embedding 边界测试。"""
import pytest
from loguru import logger
import sys
import os
from __future__ import annotations
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
from dataclasses import dataclass
from app.services.embedding.text_chunker import RegulationChunker, TextChunk, ChunkMetadata
from app.services.embedding.bge_m3_embedder import BGEM3Embedder, EmbeddingResult
from app.application.documents.services import DocumentCommandService
from app.domain.documents import Chunk, Document, DocumentStatus, ParsedDocument
from app.shared import bootstrap
class TestRegulationChunker:
"""法规分块器测试"""
class FakeRepository:
def __init__(self) -> None:
self.documents: dict[str, Document] = {}
@pytest.fixture
def chunker(self):
"""创建分块器实例"""
return RegulationChunker(chunk_size=512)
def create(self, document: Document) -> Document:
self.documents[document.doc_id] = document
return document
@pytest.fixture
def sample_regulation(self):
"""示例法规文档"""
return """
# GB 7258-2017 机动车运行安全技术条件
def update(self, document: Document) -> Document:
self.documents[document.doc_id] = document
return document
第一章 范围
def get(self, doc_id: str) -> Document | None:
return self.documents.get(doc_id)
第一条 本标准规定了机动车运行安全技术条件。
def list(self, limit: int | None = None) -> list[Document]:
values = list(self.documents.values())
return values[:limit] if limit is not None else values
第二条 本标准适用于在我国道路上行驶的所有机动车。
def update_status(
self,
doc_id: str,
status: DocumentStatus,
*,
error_message: str = "",
chunk_count: int | None = None,
summary: str | None = None,
summary_latency_ms: int | None = None,
parser_name: str | None = None,
index_name: str | None = None,
metadata: dict | None = None,
) -> Document | None:
document = self.documents.get(doc_id)
if not document:
return None
document.status = status
document.error_message = error_message
if chunk_count is not None:
document.chunk_count = chunk_count
if summary is not None:
document.summary = summary
if summary_latency_ms is not None:
document.summary_latency_ms = summary_latency_ms
if parser_name is not None:
document.parser_name = parser_name
if index_name is not None:
document.index_name = index_name
if metadata:
document.metadata.update(metadata)
return document
第二章 术语和定义
第三条 下列术语和定义适用于本标准。
class FakeBinaryStore:
def __init__(self) -> None:
self.saved: dict[str, bytes] = {}
(一)机动车:以动力装置驱动或者牵引,上道路行驶的供人员乘用或者用于运送物品以及进行工程专项作业的轮式车辆。
def save(self, *, object_name: str, data: bytes, content_type: str, metadata: dict[str, str] | None = None) -> None:
self.saved[object_name] = data
(二)整车:完整的机动车,包括所有必要的部件和系统。
def read(self, object_name: str) -> bytes:
return self.saved[object_name]
第三章 技术要求
def delete(self, object_name: str) -> None:
self.saved.pop(object_name, None)
第四条 机动车应满足以下基本要求:
1. 车辆应具有唯一的产品标识;
2. 车辆结构应安全可靠;
3. 车辆应配备必要的安全装置。
"""
def test_chunk_document(self, chunker, sample_regulation):
"""测试文档分块"""
chunks = chunker.chunk_document(
sample_regulation,
doc_id="gb7258",
doc_name="GB 7258-2017",
regulation_type="车辆安全"
class FakeParser:
def parse(self, *, file_path: str, doc_id: str, doc_name: str) -> ParsedDocument:
return ParsedDocument(
doc_id=doc_id,
doc_name=doc_name,
structure_nodes=[{"title": "第一章"}],
semantic_blocks=[{"semantic_id": "semantic-1", "text": "法规正文", "section_title": "第一章"}],
vector_chunks=[
{
"chunk_id": f"{doc_id}-chunk-1",
"semantic_id": "semantic-1",
"chunk_type": "section_text",
"section_title": "第一章",
"section_path": ["第一章"],
"page_start": 1,
"text": "法规正文",
"embedding_text": "标准:测试\n章节:第一章\n\n法规正文",
}
],
parser_name="fake_parser",
)
# 应该有多个分块
assert len(chunks) > 3
# 每个分块应该有内容
for chunk in chunks:
assert len(chunk.content) > 0
assert chunk.metadata.doc_id == "gb7258"
def test_section_detection(self, chunker, sample_regulation):
"""测试章节检测"""
chunks = chunker.chunk_document(
sample_regulation,
doc_id="test",
doc_name="测试"
)
# 应该检测到章节
section_numbers = [c.metadata.section_number for c in chunks]
assert any(s for s in section_numbers) # 至少有一个章节编号
def test_clause_detection(self, chunker, sample_regulation):
"""测试条款检测"""
chunks = chunker.chunk_document(
sample_regulation,
doc_id="test",
doc_name="测试"
)
# 应该检测到条款
clause_numbers = [c.metadata.clause_number for c in chunks]
assert any(c for c in clause_numbers) # 至少有一个条款编号
def test_long_clause_split(self, chunker):
"""测试长条款分割"""
long_clause = """
第一条 本条款内容很长,需要进行分割处理。
本条款包含以下多项内容:
1. 第一项内容,这是一个非常长的子项,包含了大量的文字描述,需要进行适当的处理。
2. 第二项内容,这也是一个较长的子项,包含了相关的技术要求和规范说明。
3. 第三项内容,继续描述相关要求和注意事项,确保文档的完整性和规范性。
4. 第四项内容,补充说明其他相关事项,保证内容的全面性。
"""
chunks = chunker.chunk_document(
long_clause,
doc_id="test",
doc_name="测试"
)
# 长条款应该被分割成多个chunk
assert len(chunks) >= 1
class TestBGEM3Embedder:
"""BGE-M3嵌入模型测试"""
@pytest.fixture
def embedder(self):
"""创建嵌入模型实例"""
try:
return BGEM3Embedder()
except Exception as e:
pytest.skip(f"嵌入模型加载失败: {e}")
def test_embed_single(self, embedder):
"""测试单文本嵌入"""
text = "这是一条测试文本"
result = embedder.embed_single(text)
# 应该包含dense和sparse向量
assert 'dense' in result
assert 'sparse' in result
# dense向量维度应该是1024
assert len(result['dense']) == 1024
def test_embed_batch(self, embedder):
"""测试批量嵌入"""
texts = [
"第一条 本标准规定了机动车安全要求",
"第二条 机动车应符合技术条件",
"第三条 生产企业应建立管理体系"
class FakeChunkBuilder:
def build(self, *, parsed_document: ParsedDocument, regulation_type: str, version: str) -> list[Chunk]:
return [
Chunk(
chunk_id=f"{parsed_document.doc_id}-chunk-1",
doc_id=parsed_document.doc_id,
doc_name=parsed_document.doc_name,
content="法规正文",
embedding_text="标准:测试\n章节:第一章\n\n法规正文",
section_title="第一章",
section_path=["第一章"],
page_number=1,
regulation_type=regulation_type,
version=version,
semantic_id="semantic-1",
block_type="section_text",
metadata={"source": "aliyun_vector_chunk"},
)
]
result = embedder.embed(texts)
# 应该返回正确数量的向量
assert len(result.dense_embeddings) == 3
class FakeEmbeddingProvider:
def __init__(self) -> None:
self.calls: list[list[str]] = []
# 维度应该是1024
assert result.dense_embeddings.shape[1] == 1024
def embed_texts(self, texts: list[str]) -> list[list[float]]:
self.calls.append(texts)
return [[0.1] * 1536 for _ in texts]
def test_embed_empty_list(self, embedder):
"""测试空列表嵌入"""
result = embedder.embed([])
# 应该返回空结果
assert len(result.dense_embeddings) == 0
def test_similarity(self, embedder):
"""测试相似度计算"""
import numpy as np
texts = [
"机动车安全标准要求",
"汽车安全技术规范",
"食品安全管理规定" # 不相关文本
]
result = embedder.embed(texts)
# 计算第一个文本与其他文本的相似度
query = result.dense_embeddings[0]
docs = result.dense_embeddings[1:]
similarities = embedder.compute_similarity(query, docs)
# 相关文档的相似度应该更高
assert similarities[0] > similarities[1] # 车辆安全 > 食品安全
def embed_query(self, text: str) -> list[float]:
return [0.2] * 1536
if __name__ == "__main__":
pytest.main([__file__, "-v"])
class FakeVectorIndex:
    """In-memory stand-in for the vector index.

    Every ``upsert`` call is recorded on ``self.upserts`` so a test can
    inspect what would have been indexed; the remaining methods return
    fixed values.
    """

    def __init__(self) -> None:
        # One (chunks, vectors) pair per upsert call, in call order.
        self.upserts: list[tuple[list[Chunk], list[list[float]]]] = []

    def upsert(self, chunks: list[Chunk], vectors: list[list[float]]) -> int:
        """Record the batch and return how many chunks it contained."""
        batch = (chunks, vectors)
        self.upserts.append(batch)
        return len(chunks)

    def delete_by_document(self, doc_id: str) -> int:
        """Pretend nothing was stored for ``doc_id``; always returns 0."""
        return 0

    def search(self, query_vector: list[float], top_k: int, filters: str | None = None):
        """Return an empty hit list regardless of the arguments."""
        return []

    def health(self) -> dict:
        """Return a fresh dict naming the collection the fake claims to serve."""
        return {"collection_name": "regulations_dense_1536"}
def test_document_command_service_uses_1536_dense_embedding_and_updates_status():
    """Drive one upload through ``DocumentCommandService`` wired with fakes.

    Asserts on the returned result, the text handed to the embedding
    provider, the number of index upserts, and the fields written back to
    the stored document.
    """
    repo = FakeRepository()
    embedder = FakeEmbeddingProvider()
    index = FakeVectorIndex()
    command_service = DocumentCommandService(
        document_repository=repo,
        binary_store=FakeBinaryStore(),
        parser=FakeParser(),
        chunk_builder=FakeChunkBuilder(),
        embedding_provider=embedder,
        vector_index=index,
    )

    upload_request = {
        "doc_id": "doc12345",
        "file_name": "test.pdf",
        "content": b"dummy pdf bytes",
        "content_type": "application/pdf",
        "doc_name": "测试法规",
        "regulation_type": "车辆安全",
        "version": "2026",
        "generate_summary": False,
    }
    outcome = command_service.upload_and_process(**upload_request)

    # Result returned to the caller.
    assert outcome.status == "indexed"
    assert outcome.num_chunks == 1

    # Exactly one embedding call and one index upsert are expected.
    assert embedder.calls == [["标准:测试\n章节:第一章\n\n法规正文"]]
    assert len(index.upserts) == 1

    # State persisted on the repository's copy of the document.
    persisted = repo.get("doc12345")
    assert persisted is not None
    assert persisted.status == DocumentStatus.INDEXED
    assert persisted.chunk_count == 1
    assert persisted.parser_name == "fake_parser"
    assert persisted.index_name == "regulations_dense_1536"
def test_bootstrap_defaults_to_local_parser_and_chunk_builder():
    """Check the class names produced by the bootstrap factories after a cache reset."""
    factories = (bootstrap.get_parser, bootstrap.get_chunk_builder)

    # Clear both caches before resolving either object.
    for factory in factories:
        factory.cache_clear()

    resolved_parser = factories[0]()
    resolved_builder = factories[1]()

    assert resolved_parser.__class__.__name__ == "LocalDocumentParser"
    assert resolved_builder.__class__.__name__ == "LocalRegulationChunkBuilder"

View File

@@ -1,137 +1,127 @@
# tests/test_milvus.py
"""Milvus集成测试"""
"""新架构下的检索与 Milvus dense-only 约定测试。"""
import pytest
from loguru import logger
import sys
import os
from __future__ import annotations
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
from app.services.storage.milvus_client import MilvusClient, SearchResult
from app.services.embedding.bge_m3_embedder import BGEM3Embedder
from app.config.settings import settings
from app.application.agent.services import AgentConversationService
from app.application.knowledge.services import KnowledgeRetrievalService
from app.domain.conversation.models import AnswerResult, AnswerSource, ConversationSession
from app.domain.retrieval import RetrievalQuery, RetrievedChunk
class TestMilvusConnection:
"""Milvus连接测试"""
class FakeRetriever:
def __init__(self) -> None:
self.queries: list[RetrievalQuery] = []
def test_connection(self):
"""测试Milvus连接"""
client = MilvusClient()
result = client.connect()
assert result == True
client.disconnect()
def test_create_collection(self):
"""测试创建Collection"""
client = MilvusClient()
client.connect()
result = client.create_collection(recreate=True)
assert result == True
# 检查Collection是否存在
stats = client.get_collection_stats()
assert stats["name"] == settings.milvus_collection
client.disconnect()
class TestMilvusOperations:
"""Milvus操作测试"""
@pytest.fixture
def client(self):
"""创建测试客户端"""
client = MilvusClient()
client.connect()
client.create_collection(recreate=True)
client.load_collection()
yield client
client.disconnect()
def test_insert_and_search(self, client):
"""测试插入和检索"""
from app.services.embedding.text_chunker import TextChunk, ChunkMetadata
# 创建测试数据
chunks = [
TextChunk(
content="第一条 为保障机动车安全技术性能,预防和减少机动车交通事故,保护人身安全,制定本标准。",
metadata=ChunkMetadata(
doc_id="test_doc",
doc_name="测试文档",
chunk_id="test_chunk_1",
clause_number="第一条",
regulation_type="车辆安全"
)
),
TextChunk(
content="第二条 本标准适用于在我国道路上行驶的所有机动车。",
metadata=ChunkMetadata(
doc_id="test_doc",
doc_name="测试文档",
chunk_id="test_chunk_2",
clause_number="第二条",
regulation_type="车辆安全"
)
def retrieve(self, query: RetrievalQuery) -> list[RetrievedChunk]:
self.queries.append(query)
return [
RetrievedChunk(
chunk_id="chunk-1",
doc_id="doc-1",
doc_name="测试法规",
content="法规正文",
score=0.91,
section_title="第一章",
page_number=1,
metadata={"section_title": "第一章"},
)
]
# 生成嵌入
embedder = BGEM3Embedder()
embeddings = embedder.embed([c.content for c in chunks])
def search(self, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
return self.retrieve(RetrievalQuery(query=query, top_k=top_k, filters=filters))
# 插入数据
inserted_ids = client.insert_chunks(chunks, embeddings)
assert len(inserted_ids) == 2
# 执行检索
query = "机动车安全标准"
query_embedding = embedder.embed_single(query)
results = client.hybrid_search(
query_dense=query_embedding['dense'].tolist(),
query_sparse=query_embedding['sparse'],
top_k=2
class FakeAnswerGenerator:
def generate(
self,
*,
query: str,
retrieved_chunks: list[RetrievedChunk],
history: list[dict[str, str]] | None = None,
provider: str | None = None,
model: str | None = None,
prompt_template: str | None = None,
) -> AnswerResult:
return AnswerResult(
answer=f"回答: {query}",
sources=[
AnswerSource(
doc_id=item.doc_id,
doc_name=item.doc_name,
chunk_id=item.chunk_id,
section_title=item.section_title,
page_number=item.page_number,
score=item.score,
content=item.content,
metadata=item.metadata,
)
for item in retrieved_chunks
],
model=model or "deepseek-v4-flash",
latency_ms=12,
retrieved_count=len(retrieved_chunks),
context_tokens=128,
)
assert len(results) > 0
assert "机动车" in results[0].content or "安全" in results[0].content
def stream_generate(self, **kwargs):
sources = [source.__dict__ for source in self.generate(**kwargs).sources]
yield {"event": "sources", "data": sources}
yield {"event": "content", "data": "流式回答"}
yield {"event": "done", "data": {"retrieved_count": 1}}
class TestEmbedding:
"""嵌入模型测试"""
class FakeConversationStore:
def __init__(self) -> None:
self.sessions: dict[str, ConversationSession] = {}
def test_embed_single_text(self):
"""测试单文本嵌入"""
embedder = BGEM3Embedder()
def create_session(self, metadata: dict | None = None) -> ConversationSession:
session = ConversationSession(session_id="sess-1", created_at=1, updated_at=1, metadata=metadata or {})
self.sessions[session.session_id] = session
return session
result = embedder.embed_single("这是一条测试文本")
def get_session(self, session_id: str) -> ConversationSession | None:
return self.sessions.get(session_id)
assert 'dense' in result
assert 'sparse' in result
assert len(result['dense']) == 1024 # BGE-M3默认维度
def save_message(self, session_id: str, *, role: str, content: str, sources: list[dict] | None = None):
session = self.sessions.get(session_id)
if session is None:
return None
session.messages.append(type("Msg", (), {"role": role, "content": content})())
return session
def test_embed_batch(self):
"""测试批量嵌入"""
embedder = BGEM3Embedder()
def delete_session(self, session_id: str) -> bool:
return self.sessions.pop(session_id, None) is not None
texts = [
"第一条 本标准规定了机动车安全要求",
"第二条 机动车应符合以下技术条件",
"第三条 生产企业应建立质量管理体系"
]
result = embedder.embed(texts)
assert len(result.dense_embeddings) == 3
assert result.dense_embeddings.shape[1] == 1024
def list_sessions(self) -> list[dict]:
return [{"session_id": key, "message_count": len(value.messages), "created_at": value.created_at, "updated_at": value.updated_at} for key, value in self.sessions.items()]
if __name__ == "__main__":
pytest.main([__file__, "-v"])
def test_knowledge_retrieval_service_builds_retrieval_query():
    """The service must pass query, top_k and filters through to the retriever."""
    fake_retriever = FakeRetriever()
    hits = KnowledgeRetrievalService(retriever=fake_retriever).retrieve(
        query="机动车安全",
        top_k=3,
        filters='doc_name == "测试法规"',
    )

    assert len(hits) == 1

    # Inspect the query object the fake recorded.
    recorded = fake_retriever.queries[0]
    assert recorded.query == "机动车安全"
    assert recorded.top_k == 3
    assert recorded.filters == 'doc_name == "测试法规"'
def test_agent_conversation_service_reuses_shared_retrieval_service():
    """A chat turn should retrieve through the shared service and store two messages."""
    fake_retriever = FakeRetriever()
    store = FakeConversationStore()
    agent_service = AgentConversationService(
        retrieval_service=KnowledgeRetrievalService(retriever=fake_retriever),
        answer_generator=FakeAnswerGenerator(),
        conversation_store=store,
    )

    session_id, result = agent_service.chat(query="问一个问题", top_k=2, model="qwen3.5-flash")

    assert session_id == "sess-1"
    assert result.answer == "回答: 问一个问题"
    assert result.retrieved_count == 1
    # The retriever saw the caller's top_k.
    assert fake_retriever.queries[0].top_k == 2
    stored_messages = store.sessions["sess-1"].messages
    assert len(stored_messages) == 2

View File

@@ -1,119 +1,236 @@
# tests/test_parser.py
"""文档解析测试"""
"""API contract checks for the migrated backend architecture."""
import pytest
from loguru import logger
import sys
import os
from __future__ import annotations
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
from dataclasses import dataclass
from app.services.parser.pdf_parser import PDFParser, parse_pdf_to_markdown
from app.services.parser.docx_parser import DocxParser, parse_docx_to_markdown
from app.services.parser.mineru_parser import MinerUParser, ParserOrchestrator
from fastapi.testclient import TestClient
from app.api.main import app
from app.application.documents import DocumentProcessResult
from app.domain.conversation.models import AnswerResult, AnswerSource, ConversationSession
from app.domain.documents import Document, DocumentStatus
from app.domain.retrieval import RetrievedChunk
class TestPDFParser:
"""PDF解析测试"""
def test_parser_initialization(self):
"""测试PDF解析器初始化"""
parser = PDFParser()
assert parser is not None
def test_parse_sample_pdf(self):
"""测试解析示例PDF如果有"""
# 如果有示例PDF文件可以在此测试
sample_pdf = os.path.join(os.path.dirname(__file__), "sample.pdf")
if os.path.exists(sample_pdf):
parser = PDFParser()
result = parser.parse(sample_pdf)
assert result.total_pages > 0
assert len(result.pages) > 0
assert len(result.markdown_text) > 0
@dataclass
class FakeMessage:
    """Minimal chat message carrying only a role and its text content."""

    # Speaker of the message; the fixtures in this file use "user" and "assistant".
    role: str
    # Message text.
    content: str
class TestDocxParser:
"""Word文档解析测试"""
def test_parser_initialization(self):
"""测试Word解析器初始化"""
parser = DocxParser()
assert parser is not None
def test_parse_sample_docx(self):
"""测试解析示例DOCX"""
sample_docx = os.path.join(os.path.dirname(__file__), "sample.docx")
if os.path.exists(sample_docx):
parser = DocxParser()
result = parser.parse(sample_docx)
assert len(result.paragraphs) > 0
assert len(result.markdown_text) > 0
class TestChunker:
"""分块器测试"""
def test_chunker_initialization(self):
"""测试分块器初始化"""
from app.services.embedding.text_chunker import RegulationChunker
chunker = RegulationChunker(chunk_size=512)
assert chunker is not None
def test_chunk_sample_text(self):
"""测试分块示例文本"""
from app.services.embedding.text_chunker import RegulationChunker
sample_text = """
# 测试法规文档
第一章 总则
第一条 为规范某项行为,制定本规定。
第二条 本规定适用于相关主体。
第二章 具体要求
第三条 相关主体应当遵守以下要求:
(一)建立管理制度;
(二)配备专业人员;
(三)定期进行检查。
"""
chunker = RegulationChunker(chunk_size=256)
chunks = chunker.chunk_document(
sample_text,
doc_id="test",
doc_name="测试法规"
class FakeDocumentCommandService:
def upload_and_process(self, **kwargs) -> DocumentProcessResult:
return DocumentProcessResult(
doc_id="doc-api-1",
doc_name=kwargs.get("doc_name") or "test.pdf",
status="indexed",
message="处理成功",
num_chunks=2,
summary="",
summary_latency_ms=0,
)
assert len(chunks) > 0
# 验证分块包含章节信息
has_section = any(c.metadata.section_number for c in chunks)
assert has_section
class FakeDocumentQueryService:
    """Canned query service that knows exactly one document, ``doc-api-1``."""

    @staticmethod
    def _build_document(doc_id: str) -> Document:
        """Create the single indexed document this fake serves."""
        return Document(
            doc_id=doc_id,
            doc_name="测试法规",
            file_name="test.pdf",
            object_name="doc-api-1/test.pdf",
            content_type="application/pdf",
            size_bytes=12,
            status=DocumentStatus.INDEXED,
            chunk_count=2,
        )

    def get(self, doc_id: str) -> Document | None:
        """Return the known document, or ``None`` for any other id."""
        if doc_id == "doc-api-1":
            return self._build_document(doc_id)
        return None

    def list_documents(self, limit: int | None = None) -> list[Document]:
        """Return a one-element list, truncated to ``limit`` when one is given."""
        documents = [self._build_document("doc-api-1")]
        if limit is None:
            return documents
        return documents[:limit]

    def download(self, doc_id: str) -> tuple[Document, bytes]:
        """Return the document with fixed payload bytes.

        Raises:
            FileNotFoundError: when ``doc_id`` is not the known document.
        """
        found = self.get(doc_id)
        if found is None:
            raise FileNotFoundError(doc_id)
        return found, b"pdf-bytes"
class TestFullPipeline:
"""完整流程测试"""
def test_pipeline_without_files(self):
"""测试流程初始化(无文件)"""
from app.services.document_processor import DocumentProcessor
processor = DocumentProcessor()
assert processor is not None
processor.close()
class FakeRetrievalService:
    """Retrieval stub that always answers with one chunk from ``doc-api-1``."""

    def retrieve(self, *, query: str, top_k: int, filters: str | None = None) -> list[RetrievedChunk]:
        """Return a single chunk whose content embeds ``query``.

        ``top_k`` is accepted but ignored; the filter expression (or an empty
        string when none is given) is echoed back under ``metadata["filters"]``.
        """
        only_hit = RetrievedChunk(
            chunk_id="chunk-1",
            doc_id="doc-api-1",
            doc_name="测试法规",
            content=f"关于 {query} 的法规内容",
            score=0.92,
            section_title="第一章",
            page_number=1,
            metadata={"filters": filters or ""},
        )
        return [only_hit]
if __name__ == "__main__":
pytest.main([__file__, "-v"])
class FakeConversationStore:
    """Conversation store holding one pre-populated session, ``sess-1``."""

    def __init__(self) -> None:
        # Seed the session with one user/assistant exchange.
        history = [
            FakeMessage(role="user", content="历史问题"),
            FakeMessage(role="assistant", content="历史回答"),
        ]
        self.session = ConversationSession(
            session_id="sess-1",
            created_at=1,
            updated_at=1,
            messages=history,
        )

    def get_session(self, session_id: str) -> ConversationSession | None:
        """Return the seeded session for ``sess-1``; ``None`` otherwise."""
        return self.session if session_id == "sess-1" else None

    def delete_session(self, session_id: str) -> bool:
        """Report whether ``session_id`` names the seeded session; nothing is removed."""
        return session_id == "sess-1"

    def list_sessions(self) -> list[dict]:
        """Return a one-row summary of the seeded session."""
        summary = {
            "session_id": "sess-1",
            "message_count": len(self.session.messages),
            "created_at": 1,
            "updated_at": 1,
        }
        return [summary]
class FakeAgentConversationService:
    """Agent service stub returning fixed answers for ask, chat and stream_chat."""

    def ask(self, **kwargs):
        """Return ``(None, AnswerResult)`` with one canned source.

        The ``model`` keyword is echoed back when truthy; otherwise the
        default model name is used.
        """
        cited_source = AnswerSource(
            doc_id="doc-api-1",
            doc_name="测试法规",
            chunk_id="chunk-1",
            section_title="第一章",
            page_number=1,
            score=0.92,
            content="法规原文",
            metadata={"section_title": "第一章"},
        )
        answer = AnswerResult(
            answer="这是基于法规上下文的回答",
            sources=[cited_source],
            model=kwargs.get("model") or "qwen3.5-flash",
            latency_ms=11,
            retrieved_count=1,
            context_tokens=128,
            truncated=False,
            error=None,
        )
        return None, answer

    def chat(self, **kwargs):
        """Return ``("sess-1", AnswerResult)`` with no sources attached."""
        answer = AnswerResult(
            answer="会话回答",
            sources=[],
            model=kwargs.get("model") or "qwen3.5-flash",
            latency_ms=12,
            retrieved_count=1,
            context_tokens=64,
            truncated=False,
            error=None,
        )
        return "sess-1", answer

    def stream_chat(self, **kwargs):
        """Return ``("sess-1", iterator)`` over three fixed event dicts."""
        events = [
            {"event": "status", "data": "正在处理"},
            {"event": "content", "data": "流式回答"},
            {"event": "done", "data": {"retrieved_count": 1}},
        ]
        return "sess-1", iter(events)
def test_documents_upload_contract_preserved(monkeypatch):
    """POST /documents/upload must keep returning the expected response fields."""
    from app.api.routes import documents

    # Swap the real command service factory for the canned fake.
    monkeypatch.setattr(documents, "get_document_command_service", lambda: FakeDocumentCommandService())

    upload = TestClient(app).post(
        "/api/v1/documents/upload",
        files={"file": ("test.pdf", b"dummy-pdf", "application/pdf")},
        data={"doc_name": "测试法规", "regulation_type": "车辆安全", "version": "2026"},
    )

    assert upload.status_code == 200
    body = upload.json()
    assert body["doc_id"] == "doc-api-1"
    assert body["doc_name"] == "测试法规"
    assert body["status"] == "indexed"
    assert body["num_chunks"] == 2
def test_documents_query_contract_preserved(monkeypatch):
    """The status, list and download endpoints must keep their response shape."""
    from app.api.routes import documents

    monkeypatch.setattr(documents, "get_document_query_service", lambda: FakeDocumentQueryService())
    http = TestClient(app)

    # Status lookup for the one known document.
    status_reply = http.get("/api/v1/documents/status/doc-api-1")
    assert status_reply.status_code == 200
    assert status_reply.json()["status"] == "indexed"

    # Listing reports exactly one document.
    listing_reply = http.get("/api/v1/documents/list")
    assert listing_reply.status_code == 200
    assert listing_reply.json()["total"] == 1

    # Download streams back the fake's payload bytes.
    download_reply = http.get("/api/v1/documents/download/doc-api-1")
    assert download_reply.status_code == 200
    assert download_reply.content == b"pdf-bytes"
def test_knowledge_retrieval_contract_preserved(monkeypatch):
    """POST /knowledge/retrieval must echo the query and expose per-hit metadata."""
    from app.api.routes import knowledge

    monkeypatch.setattr(knowledge, "get_retrieval_service", lambda: FakeRetrievalService())

    request_body = {"query": "机动车安全", "top_k": 3, "filters": 'doc_id == "doc-api-1"'}
    reply = TestClient(app).post("/api/v1/knowledge/retrieval", json=request_body)

    assert reply.status_code == 200
    body = reply.json()
    assert body["query"] == "机动车安全"
    assert body["total"] == 1

    # Metadata of the first hit as serialized by the route.
    first_hit_metadata = body["results"][0]["metadata"]
    assert first_hit_metadata["doc_id"] == "doc-api-1"
    assert first_hit_metadata["section_title"] == "第一章"
def test_agent_ask_and_stream_contract_preserved(monkeypatch):
    """Check the JSON shape of /agent/ask and the SSE framing of /agent/chat/stream."""
    from app.api.routes import agent

    store = FakeConversationStore()
    monkeypatch.setattr(agent, "get_agent_conversation_service", lambda: FakeAgentConversationService())
    monkeypatch.setattr(agent, "get_conversation_store", lambda: store)
    http = TestClient(app)

    # Single-shot question.
    ask_reply = http.post("/api/v1/agent/ask", json={"query": "这个法规要求什么?"})
    assert ask_reply.status_code == 200
    ask_body = ask_reply.json()
    assert ask_body["answer"] == "这是基于法规上下文的回答"
    assert ask_body["retrieved_count"] == 1
    assert ask_body["sources"][0]["doc_id"] == "doc-api-1"

    # Streaming chat: must be served as an event stream with session and content events.
    stream_reply = http.get("/api/v1/agent/chat/stream", params={"query": "继续说明"})
    assert stream_reply.status_code == 200
    assert stream_reply.headers["content-type"].startswith("text/event-stream")
    stream_text = stream_reply.text
    assert "event: session" in stream_text
    assert "event: content" in stream_text

View File

@@ -1,223 +1,211 @@
"""
MVP功能验证脚本
Post-migration backend smoke checks.
用于验证完整的文档处理流程:
1. PDF/DOCX解析
2. 智能分块
3. 向量嵌入
4. Milvus入库
5. 混合检索
使用方法:
1. 首先启动Milvus: docker-compose up -d
2. 运行此脚本: python verify_mvp.py
Purpose:
1. Verify the new architecture modules can be imported
2. Verify migration-critical config matches the RFC
3. Verify external dependencies when they are available
4. Optionally verify the real ingest path with a sample document
"""
import os
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "backend"))
from loguru import logger
from app.config.logging import setup_logging
from app.services.document_processor import DocumentProcessor, ProcessingResult
from app.services.storage.milvus_client import MilvusClient
from app.config.settings import settings
# 设置日志
from app.config.logging import setup_logging
from app.config.settings import settings
from app.shared.bootstrap import (
get_document_command_service,
get_retrieval_service,
get_vector_index,
)
setup_logging(level="INFO")
def verify_milvus_connection():
"""验证Milvus连接"""
logger.info("=" * 50)
logger.info("Step 1: 验证Milvus连接")
logger.info("=" * 50)
client = MilvusClient()
try:
result = client.connect()
if result:
logger.success("Milvus连接成功")
# 创建Collection
client.create_collection(recreate=True)
stats = client.get_collection_stats()
logger.info(f"Collection信息: {stats}")
client.disconnect()
return True
else:
logger.error("Milvus连接失败请检查docker-compose是否启动")
return False
except Exception as e:
logger.error(f"Milvus连接异常: {e}")
logger.info("请先启动Milvus: cd docker && docker-compose up -d")
return False
def verify_embedding_model():
"""验证嵌入模型"""
logger.info("=" * 50)
logger.info("Step 2: 验证BGE-M3嵌入模型")
logger.info("=" * 50)
try:
from app.services.embedding.bge_m3_embedder import BGEM3Embedder
embedder = BGEM3Embedder()
logger.success("嵌入模型加载成功")
# 测试嵌入
test_text = "这是一条测试文本,用于验证嵌入模型功能"
result = embedder.embed_single(test_text)
logger.info(f"Dense向量维度: {len(result['dense'])}")
logger.info(f"Sparse向量词数: {len(result['sparse'])}")
return True
except Exception as e:
logger.error(f"嵌入模型验证失败: {e}")
logger.info("请确保已安装FlagEmbedding: pip install FlagEmbedding")
return False
def verify_sample_document():
"""验证示例文档处理"""
logger.info("=" * 50)
logger.info("Step 3: 验证文档处理流程")
logger.info("=" * 50)
# 使用内置的示例文本(无需外部文件)
sample_text = """
# GB 7258-2017 机动车运行安全技术条件
第一章 范围
第一条 本标准规定了机动车运行安全技术条件,适用于在我国道路上行驶的所有机动车。
第二条 本标准包括整车、发动机、传动系、行驶系、制动系、照明与信号装置等技术要求。
第二章 术语和定义
第三条 下列术语和定义适用于本标准:
(一)机动车:以动力装置驱动或者牵引,上道路行驶的供人员乘用或者用于运送物品的轮式车辆。
(二)整车产品:完整的机动车产品,包括所有必要的部件和系统。
第三章 整车技术要求
第四条 机动车整车应满足以下基本技术要求:
1. 车辆外廓尺寸应符合规定限值;
2. 车辆应具有唯一的产品标识;
3. 车辆结构应安全可靠,各部件连接牢固。
第五条 车辆应配备必要的安全装置,包括:
- 制动系统
- 照明与信号装置
- 安全带
- 灭火器
"""
try:
from app.services.embedding.text_chunker import RegulationChunker
from app.services.embedding.bge_m3_embedder import BGEM3Embedder
from app.services.storage.milvus_client import MilvusClient
# 1. 分块
logger.info("测试分块...")
chunker = RegulationChunker(chunk_size=256)
chunks = chunker.chunk_document(
sample_text,
doc_id="gb7258_test",
doc_name="GB 7258-2017 测试",
regulation_type="车辆安全"
)
logger.success(f"分块完成,共{len(chunks)}个chunk")
# 2. 嵌入
logger.info("测试嵌入...")
embedder = BGEM3Embedder()
embeddings = embedder.embed([c.content for c in chunks])
logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}")
# 3. 入库
logger.info("测试入库...")
client = MilvusClient()
client.connect()
client.create_collection(recreate=False)
client.load_collection()
inserted_ids = client.insert_chunks(chunks, embeddings)
logger.success(f"入库完成,共{len(inserted_ids)}条记录")
# 4. 检索
logger.info("测试检索...")
query = "机动车安全技术要求"
query_emb = embedder.embed_single(query)
results = client.hybrid_search(
query_dense=query_emb['dense'].tolist(),
query_sparse=query_emb['sparse'],
top_k=3
)
logger.success(f"检索完成,返回{len(results)}条结果")
for i, r in enumerate(results):
logger.info(f"结果{i+1}: 分数={r.score:.4f}, 内容={r.content[:50]}...")
client.disconnect()
return True
except Exception as e:
logger.error(f"文档处理验证失败: {e}")
return False
def main():
"""主验证流程"""
logger.info("\n" + "=" * 60)
logger.info("AI+合规智能中枢 MVP功能验证")
def verify_service_wiring() -> bool:
"""Verify the new module layout and service entrypoints can be imported."""
logger.info("=" * 60)
logger.info("Step 1: verify module wiring")
logger.info("=" * 60)
results = []
try:
from app.api.main import app
from app.application.agent import AgentConversationService
from app.application.documents import DocumentCommandService, DocumentQueryService
from app.application.knowledge import KnowledgeRetrievalService
from app.shared import bootstrap
# 1. Milvus连接验证
results.append(("Milvus连接", verify_milvus_connection()))
assert app is not None
assert DocumentCommandService is not None
assert DocumentQueryService is not None
assert KnowledgeRetrievalService is not None
assert AgentConversationService is not None
assert bootstrap is not None
logger.success("module wiring ok")
return True
except Exception as exc:
logger.error(f"module wiring failed: {exc}")
return False
# 2. 嵌入模型验证
results.append(("嵌入模型", verify_embedding_model()))
# 3. 文档处理验证
results.append(("文档处理", verify_sample_document()))
def verify_migration_config() -> bool:
    """Verify migration-critical config values.

    Compares ``settings.embedding_model``, ``settings.embedding_dim`` and
    ``settings.milvus_collection`` against the values the migration expects.
    The current values are always logged; every mismatching setting is named
    in the error log.

    Explicit comparisons are used instead of ``assert`` so the check still
    runs under ``python -O`` and the failure message is never empty.

    Returns:
        True when all three settings match, False otherwise (including when
        a setting cannot be read).
    """
    logger.info("=" * 60)
    logger.info("Step 2: verify migration config")
    logger.info("=" * 60)

    expected = {
        "embedding_model": "text-embedding-v3",
        "embedding_dim": 1536,
        "milvus_collection": "regulations_dense_1536",
    }

    try:
        actual = {name: getattr(settings, name) for name in expected}
    except Exception as exc:
        # A missing or unreadable setting counts as a mismatch.
        logger.error(f"migration config mismatch: {exc}")
        return False

    for name, value in actual.items():
        logger.info(f"{name}={value}")

    mismatches = [
        f"{name}: expected {wanted!r}, got {actual[name]!r}"
        for name, wanted in expected.items()
        if actual[name] != wanted
    ]
    if mismatches:
        logger.error("migration config mismatch: " + "; ".join(mismatches))
        return False

    logger.success("migration config ok")
    return True
def verify_minio_connection() -> bool:
    """Check that the document command service exposes a binary store.

    The service is resolved through ``get_document_command_service()`` and
    its ``binary_store`` attribute must be set. The check is an explicit
    comparison rather than ``assert`` so it still runs under ``python -O``
    and the failure log carries a reason.

    NOTE(review): no storage operation is performed here; whether resolving
    the service touches MinIO at all depends on the bootstrap code — confirm
    before treating a pass as proof of connectivity.

    Returns:
        True when a binary store is available, False when it is missing or
        resolving the service raised.
    """
    logger.info("=" * 60)
    logger.info("Step 3: verify MinIO connection")
    logger.info("=" * 60)

    try:
        binary_store = get_document_command_service().binary_store
        if binary_store is None:
            raise RuntimeError("binary store is not configured")
        logger.success("MinIO connection ok")
        return True
    except Exception as exc:
        logger.error(f"MinIO connection failed: {exc}")
        logger.info("start MinIO first or update .env storage settings")
        return False
def verify_milvus_connection() -> bool:
    """Query the vector index health endpoint and report whether it answered.

    Returns:
        True when ``get_vector_index().health()`` returned without raising,
        False otherwise.
    """
    banner = "=" * 60
    for line in (banner, "Step 4: verify Milvus connection", banner):
        logger.info(line)

    try:
        vector_index = get_vector_index()
        health = vector_index.health()
        logger.info(f"Milvus health: {health}")
        logger.success("Milvus connection ok")
    except Exception as exc:
        logger.error(f"Milvus connection failed: {exc}")
        logger.info("start Milvus first or update .env vector settings")
        return False
    return True
def verify_ingest_pipeline(sample_file: Path) -> bool:
    """Push a real file through upload_and_process, then run one retrieval.

    Preconditions are checked first: the file must exist, have a .pdf/.doc/
    .docx suffix, and both Aliyun access key settings must be non-empty.

    Args:
        sample_file: Path to the document to ingest.

    Returns:
        True when processing ends with status "indexed" and the follow-up
        retrieval call does not raise; False otherwise. The number of
        retrieved results is logged but does not affect the outcome.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Step 5: verify real ingest pipeline")
    logger.info(banner)

    # Guard clauses: bail out before touching any service.
    if not sample_file.exists():
        logger.error(f"sample file not found: {sample_file}")
        return False

    accepted_suffixes = {".pdf", ".doc", ".docx"}
    if sample_file.suffix.lower() not in accepted_suffixes:
        logger.error("sample file must be PDF, DOC, or DOCX")
        return False

    if not (settings.alibaba_access_key_id and settings.alibaba_access_key_secret):
        logger.error("missing Aliyun parser credentials")
        return False

    try:
        command_service = get_document_command_service()
        result = command_service.upload_and_process(
            file_name=sample_file.name,
            content=sample_file.read_bytes(),
            content_type=_guess_content_type(sample_file),
            doc_name=sample_file.stem,
            regulation_type="smoke-test",
            version="migration",
            generate_summary=False,
        )
        logger.info(f"process result: doc_id={result.doc_id}, status={result.status}, chunks={result.num_chunks}")

        if result.status != "indexed":
            logger.error(f"ingest failed: {result.message}")
            return False

        # Query back, restricted to the document that was just ingested.
        retrieval_service = get_retrieval_service()
        retrieval_results = retrieval_service.retrieve(
            query=sample_file.stem,
            top_k=3,
            filters=f'doc_id == "{result.doc_id}"',
        )
        logger.info(f"retrieval count: {len(retrieval_results)}")
        logger.success("real ingest pipeline ok")
        return True
    except Exception as exc:
        logger.error(f"real ingest pipeline failed: {exc}")
        return False
def _guess_content_type(sample_file: Path) -> str:
    """Map the file's suffix (case-insensitive) to a MIME type.

    Returns "application/octet-stream" for any suffix other than .pdf, .doc
    or .docx.
    """
    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    return mime_by_suffix.get(sample_file.suffix.lower(), "application/octet-stream")
def parse_args() -> argparse.Namespace:
    """Parse the command line; ``--sample-file`` is optional and converted to ``Path``."""
    cli = argparse.ArgumentParser(description="Verify the migrated backend path")
    cli.add_argument(
        "--sample-file",
        type=Path,
        help="Optional PDF/DOC/DOCX for real ingest verification",
    )
    namespace = cli.parse_args()
    return namespace
def main() -> bool:
args = parse_args()
results = [
("module_wiring", verify_service_wiring()),
("migration_config", verify_migration_config()),
("minio_connection", verify_minio_connection()),
("milvus_connection", verify_milvus_connection()),
]
if args.sample_file:
results.append(("real_ingest_pipeline", verify_ingest_pipeline(args.sample_file)))
else:
logger.info("no sample file provided; skip real ingest check")
# 输出结果汇总
logger.info("\n" + "=" * 60)
logger.info("验证结果汇总")
logger.info("check summary")
logger.info("=" * 60)
all_passed = True
for name, passed in results:
status = "✅ 通过" if passed else "❌ 失败"
status = "PASS" if passed else "FAIL"
logger.info(f"{name}: {status}")
if not passed:
all_passed = False
if all_passed:
logger.success("\n🎉 所有验证通过MVP功能正常")
logger.success("all executed checks passed")
else:
logger.warning("\n⚠️ 部分验证失败,请检查配置和环境")
logger.warning("some checks failed; inspect environment dependencies")
return all_passed
if __name__ == "__main__":
success = main()
sys.exit(0 if success else 1)
sys.exit(0 if main() else 1)