Fix SSE route dependency and align architecture docs
This commit is contained in:
@@ -1,223 +1,211 @@
|
||||
"""
|
||||
MVP功能验证脚本
|
||||
Post-migration backend smoke checks.
|
||||
|
||||
用于验证完整的文档处理流程:
|
||||
1. PDF/DOCX解析
|
||||
2. 智能分块
|
||||
3. 向量嵌入
|
||||
4. Milvus入库
|
||||
5. 混合检索
|
||||
|
||||
使用方法:
|
||||
1. 首先启动Milvus: docker-compose up -d
|
||||
2. 运行此脚本: python verify_mvp.py
|
||||
Purpose:
|
||||
1. Verify the new architecture modules can be imported
|
||||
2. Verify migration-critical config matches the RFC
|
||||
3. Verify external dependencies when they are available
|
||||
4. Optionally verify the real ingest path with a sample document
|
||||
"""
|
||||
|
||||
import os
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# The directory two levels above this file; `backend/` under it is put first
# on sys.path so the `app.*` imports that follow can be resolved.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "backend"))
|
||||
|
||||
from loguru import logger
|
||||
from app.config.logging import setup_logging
|
||||
from app.services.document_processor import DocumentProcessor, ProcessingResult
|
||||
from app.services.storage.milvus_client import MilvusClient
|
||||
from app.config.settings import settings
|
||||
|
||||
# 设置日志
|
||||
from app.config.logging import setup_logging
|
||||
from app.config.settings import settings
|
||||
from app.shared.bootstrap import (
|
||||
get_document_command_service,
|
||||
get_retrieval_service,
|
||||
get_vector_index,
|
||||
)
|
||||
|
||||
setup_logging(level="INFO")
|
||||
|
||||
|
||||
def verify_milvus_connection():
    """Check that Milvus accepts a connection and that the collection can be created.

    Connects with a fresh ``MilvusClient``, calls ``create_collection(recreate=True)``,
    logs the collection statistics and disconnects again.

    Returns:
        bool: ``True`` when every step succeeded; ``False`` when ``connect()``
        returned a falsy value or any step raised.
    """
    logger.info("=" * 50)
    logger.info("Step 1: 验证Milvus连接")
    logger.info("=" * 50)

    client = MilvusClient()

    try:
        if not client.connect():
            logger.error("Milvus连接失败,请检查docker-compose是否启动")
            return False

        logger.success("Milvus连接成功")
        try:
            # NOTE(review): recreate=True presumably drops an existing
            # collection before creating it — confirm this is acceptable
            # for the environment this check runs against.
            client.create_collection(recreate=True)
            stats = client.get_collection_stats()
            logger.info(f"Collection信息: {stats}")
        finally:
            # Release the connection even when collection setup fails.
            client.disconnect()
        return True

    except Exception as e:
        logger.error(f"Milvus连接异常: {e}")
        logger.info("请先启动Milvus: cd docker && docker-compose up -d")
        return False
|
||||
|
||||
|
||||
def verify_embedding_model():
    """Load the BGE-M3 embedder and embed one probe sentence.

    Returns:
        bool: ``True`` when the embedder was constructed and the probe
        sentence was embedded; ``False`` when anything raised.
    """
    banner = "=" * 50
    logger.info(banner)
    logger.info("Step 2: 验证BGE-M3嵌入模型")
    logger.info(banner)

    try:
        # Imported inside the try block so that an import failure is
        # reported as a failed check rather than crashing the script.
        from app.services.embedding.bge_m3_embedder import BGEM3Embedder

        model = BGEM3Embedder()
        logger.success("嵌入模型加载成功")

        # Embed a single sentence and report the size of both outputs.
        probe = "这是一条测试文本,用于验证嵌入模型功能"
        vectors = model.embed_single(probe)

        dense_size = len(vectors['dense'])
        logger.info(f"Dense向量维度: {dense_size}")
        sparse_size = len(vectors['sparse'])
        logger.info(f"Sparse向量词数: {sparse_size}")

        return True

    except Exception as err:
        logger.error(f"嵌入模型验证失败: {err}")
        logger.info("请确保已安装FlagEmbedding: pip install FlagEmbedding")
        return False
|
||||
|
||||
|
||||
def verify_sample_document():
    """Run chunk -> embed -> insert -> hybrid search on a built-in sample text.

    The sample regulation text is embedded in the function, so no external
    file is needed. The Milvus connection opened for the insert/search steps
    is always closed again, including when one of those steps raises.

    Returns:
        bool: ``True`` when all four steps completed; ``False`` when any
        step raised.
    """
    logger.info("=" * 50)
    logger.info("Step 3: 验证文档处理流程")
    logger.info("=" * 50)

    # Built-in sample text (no external file required).
    sample_text = """
# GB 7258-2017 机动车运行安全技术条件

第一章 范围

第一条 本标准规定了机动车运行安全技术条件,适用于在我国道路上行驶的所有机动车。

第二条 本标准包括整车、发动机、传动系、行驶系、制动系、照明与信号装置等技术要求。

第二章 术语和定义

第三条 下列术语和定义适用于本标准:

(一)机动车:以动力装置驱动或者牵引,上道路行驶的供人员乘用或者用于运送物品的轮式车辆。

(二)整车产品:完整的机动车产品,包括所有必要的部件和系统。

第三章 整车技术要求

第四条 机动车整车应满足以下基本技术要求:

1. 车辆外廓尺寸应符合规定限值;
2. 车辆应具有唯一的产品标识;
3. 车辆结构应安全可靠,各部件连接牢固。

第五条 车辆应配备必要的安全装置,包括:
- 制动系统
- 照明与信号装置
- 安全带
- 灭火器
"""

    try:
        from app.services.embedding.text_chunker import RegulationChunker
        from app.services.embedding.bge_m3_embedder import BGEM3Embedder
        from app.services.storage.milvus_client import MilvusClient

        # 1. Chunking
        logger.info("测试分块...")
        chunker = RegulationChunker(chunk_size=256)
        chunks = chunker.chunk_document(
            sample_text,
            doc_id="gb7258_test",
            doc_name="GB 7258-2017 测试",
            regulation_type="车辆安全"
        )
        logger.success(f"分块完成,共{len(chunks)}个chunk")

        # 2. Embedding
        logger.info("测试嵌入...")
        embedder = BGEM3Embedder()
        embeddings = embedder.embed([c.content for c in chunks])
        logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}")

        # 3. Insert into Milvus
        logger.info("测试入库...")
        client = MilvusClient()
        client.connect()
        try:
            client.create_collection(recreate=False)
            client.load_collection()

            inserted_ids = client.insert_chunks(chunks, embeddings)
            logger.success(f"入库完成,共{len(inserted_ids)}条记录")

            # 4. Hybrid search
            logger.info("测试检索...")
            query = "机动车安全技术要求"
            query_emb = embedder.embed_single(query)

            results = client.hybrid_search(
                query_dense=query_emb['dense'].tolist(),
                query_sparse=query_emb['sparse'],
                top_k=3
            )
            logger.success(f"检索完成,返回{len(results)}条结果")

            for i, r in enumerate(results):
                logger.info(f"结果{i+1}: 分数={r.score:.4f}, 内容={r.content[:50]}...")
        finally:
            # Close the connection even when insert or search fails.
            client.disconnect()

        return True

    except Exception as e:
        logger.error(f"文档处理验证失败: {e}")
        return False
|
||||
|
||||
|
||||
def main():
    """Main verification flow."""
    # NOTE(review): a second `main()` is defined further down in this file and
    # would replace this one once the module is loaded — confirm which
    # definition is meant to stay.
    logger.info("\n" + "=" * 60)
    logger.info("AI+合规智能中枢 MVP功能验证")
|
||||
def verify_service_wiring() -> bool:
    """Verify the new module layout and service entrypoints can be imported.

    Imports the API app, the application-layer services and the bootstrap
    module. The imports run inside the ``try`` so that a broken module is
    reported as a failed check instead of aborting the script.

    Returns:
        ``True`` when every import succeeded and every imported name is
        bound to a value; ``False`` otherwise.
    """
    logger.info("=" * 60)
    logger.info("Step 1: verify module wiring")
    logger.info("=" * 60)

    try:
        from app.api.main import app
        from app.application.agent import AgentConversationService
        from app.application.documents import DocumentCommandService, DocumentQueryService
        from app.application.knowledge import KnowledgeRetrievalService
        from app.shared import bootstrap

        entrypoints = {
            "app.api.main.app": app,
            "DocumentCommandService": DocumentCommandService,
            "DocumentQueryService": DocumentQueryService,
            "KnowledgeRetrievalService": KnowledgeRetrievalService,
            "AgentConversationService": AgentConversationService,
            "app.shared.bootstrap": bootstrap,
        }
        # Explicit check rather than `assert`: it is not removed by
        # `python -O` and the failure message names what is missing.
        unresolved = [label for label, obj in entrypoints.items() if obj is None]
        if unresolved:
            raise RuntimeError(f"unresolved entrypoints: {', '.join(unresolved)}")

        logger.success("module wiring ok")
        return True
    except Exception as exc:
        logger.error(f"module wiring failed: {exc}")
        return False
|
||||
def verify_migration_config() -> bool:
    """Verify migration-critical config values.

    Compares ``settings.embedding_model``, ``settings.embedding_dim`` and
    ``settings.milvus_collection`` with the values this script expects. Each
    actual value is logged, and every mismatch is reported with both the
    expected and the actual value.

    Returns:
        ``True`` when all three settings match; ``False`` on any mismatch or
        when reading a setting raises.
    """
    logger.info("=" * 60)
    logger.info("Step 2: verify migration config")
    logger.info("=" * 60)

    expected = {
        "embedding_model": "text-embedding-v3",
        "embedding_dim": 1536,
        "milvus_collection": "regulations_dense_1536",
    }

    try:
        mismatches = []
        for key, wanted in expected.items():
            actual = getattr(settings, key)
            # Log before comparing so the real value is visible on failure too.
            logger.info(f"{key}={actual}")
            if actual != wanted:
                mismatches.append(f"{key}: expected {wanted!r}, got {actual!r}")

        # Explicit comparison rather than `assert`: it is not removed by
        # `python -O` and the log line states what differs.
        if mismatches:
            logger.error(f"migration config mismatch: {'; '.join(mismatches)}")
            return False

        logger.success("migration config ok")
        return True
    except Exception as exc:
        logger.error(f"migration config mismatch: {exc}")
        return False
|
||||
|
||||
|
||||
def verify_minio_connection() -> bool:
    """Verify that the document command service exposes a binary store.

    NOTE(review): only the presence of ``binary_store`` is checked here;
    whether constructing the service actually contacts MinIO depends on the
    bootstrap code — confirm.

    Returns:
        ``True`` when the service was built and its ``binary_store`` is not
        ``None``; ``False`` when it is ``None`` or anything raised.
    """
    logger.info("=" * 60)
    logger.info("Step 3: verify MinIO connection")
    logger.info("=" * 60)

    try:
        binary_store = get_document_command_service().binary_store
        # Explicit check rather than `assert`: it is not removed by
        # `python -O` and the failure log gets a real message.
        if binary_store is None:
            raise RuntimeError("document command service has no binary store")
        logger.success("MinIO connection ok")
        return True
    except Exception as exc:
        logger.error(f"MinIO connection failed: {exc}")
        logger.info("start MinIO first or update .env storage settings")
        return False
|
||||
|
||||
|
||||
def verify_milvus_connection() -> bool:
    """Ask the vector index for its health and report whether the call worked.

    The value returned by ``health()`` is only logged; the check passes as
    long as the call does not raise.

    Returns:
        ``True`` when ``get_vector_index().health()`` returned, ``False``
        when it raised.
    """
    for line in ("=" * 60, "Step 4: verify Milvus connection", "=" * 60):
        logger.info(line)

    try:
        vector_index = get_vector_index()
        report = vector_index.health()
        logger.info(f"Milvus health: {report}")
        logger.success("Milvus connection ok")
    except Exception as err:
        logger.error(f"Milvus connection failed: {err}")
        logger.info("start Milvus first or update .env vector settings")
        return False
    return True
|
||||
|
||||
|
||||
def verify_ingest_pipeline(sample_file: Path) -> bool:
    """Send a real document through upload/processing, then query it back.

    Args:
        sample_file: Path to a PDF, DOC or DOCX file.

    Returns:
        ``True`` when processing ended with status ``"indexed"`` and the
        follow-up retrieval call did not raise; ``False`` when a
        precondition failed, the status differed, or anything raised.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Step 5: verify real ingest pipeline")
    logger.info(banner)

    # Preconditions are checked in a fixed order; each logs its own reason.
    if not sample_file.exists():
        logger.error(f"sample file not found: {sample_file}")
        return False

    supported_suffixes = {".pdf", ".doc", ".docx"}
    if sample_file.suffix.lower() not in supported_suffixes:
        logger.error("sample file must be PDF, DOC, or DOCX")
        return False

    if not (settings.alibaba_access_key_id and settings.alibaba_access_key_secret):
        logger.error("missing Aliyun parser credentials")
        return False

    try:
        command_service = get_document_command_service()
        outcome = command_service.upload_and_process(
            file_name=sample_file.name,
            content=sample_file.read_bytes(),
            content_type=_guess_content_type(sample_file),
            doc_name=sample_file.stem,
            regulation_type="smoke-test",
            version="migration",
            generate_summary=False,
        )
        logger.info(f"process result: doc_id={outcome.doc_id}, status={outcome.status}, chunks={outcome.num_chunks}")
        if outcome.status != "indexed":
            logger.error(f"ingest failed: {outcome.message}")
            return False

        # Query restricted to the document that was just ingested.
        retrieval_service = get_retrieval_service()
        hits = retrieval_service.retrieve(
            query=sample_file.stem,
            top_k=3,
            filters=f'doc_id == "{outcome.doc_id}"',
        )
        # NOTE(review): the hit count is only logged; an empty result still
        # counts as a pass — confirm that is intended.
        logger.info(f"retrieval count: {len(hits)}")
        logger.success("real ingest pipeline ok")
        return True
    except Exception as err:
        logger.error(f"real ingest pipeline failed: {err}")
        return False
|
||||
|
||||
|
||||
def _guess_content_type(sample_file: Path) -> str:
    """Map the file's suffix (case-insensitive) to a MIME type.

    Args:
        sample_file: Path whose suffix is inspected; the file is not opened.

    Returns:
        The MIME type for ``.pdf``, ``.doc`` or ``.docx``, and
        ``"application/octet-stream"`` for any other suffix.
    """
    known_types = {
        ".pdf": "application/pdf",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    return known_types.get(sample_file.suffix.lower(), "application/octet-stream")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse this script's command line.

    Returns:
        A namespace whose ``sample_file`` is a ``Path`` when
        ``--sample-file`` was given and ``None`` otherwise.
    """
    cli = argparse.ArgumentParser(description="Verify the migrated backend path")
    cli.add_argument(
        "--sample-file",
        type=Path,
        help="Optional PDF/DOC/DOCX for real ingest verification",
    )
    namespace = cli.parse_args()
    return namespace
|
||||
|
||||
|
||||
def main() -> bool:
    """Run the smoke checks, log a PASS/FAIL summary and return the overall result.

    The wiring, config, MinIO and Milvus checks always run. The real ingest
    check runs only when ``--sample-file`` was supplied.

    Returns:
        ``True`` when every executed check passed, ``False`` otherwise.
    """
    args = parse_args()

    results = [
        ("module_wiring", verify_service_wiring()),
        ("migration_config", verify_migration_config()),
        ("minio_connection", verify_minio_connection()),
        ("milvus_connection", verify_milvus_connection()),
    ]

    if args.sample_file:
        results.append(("real_ingest_pipeline", verify_ingest_pipeline(args.sample_file)))
    else:
        logger.info("no sample file provided; skip real ingest check")

    # Summary of every check that was executed.
    logger.info("\n" + "=" * 60)
    logger.info("check summary")
    logger.info("=" * 60)

    for name, passed in results:
        status = "PASS" if passed else "FAIL"
        logger.info(f"{name}: {status}")

    all_passed = all(passed for _, passed in results)
    if all_passed:
        logger.success("all executed checks passed")
    else:
        logger.warning("some checks failed; inspect environment dependencies")

    return all_passed
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 when main() returns a truthy
    # value, 1 otherwise.
    sys.exit(0 if main() else 1)
|
||||
|
||||
Reference in New Issue
Block a user