Fix SSE route dependency and align architecture docs

This commit is contained in:
ash66
2026-05-18 16:32:42 +08:00
parent 86b9ac806a
commit 3f69cad404
149 changed files with 4786 additions and 5957 deletions

View File

@@ -1,223 +1,211 @@
"""
MVP功能验证脚本
Post-migration backend smoke checks.
用于验证完整的文档处理流程:
1. PDF/DOCX解析
2. 智能分块
3. 向量嵌入
4. Milvus入库
5. 混合检索
使用方法:
1. 首先启动Milvus: docker-compose up -d
2. 运行此脚本: python verify_mvp.py
Purpose:
1. Verify the new architecture modules can be imported
2. Verify migration-critical config matches the RFC
3. Verify external dependencies when they are available
4. Optionally verify the real ingest path with a sample document
"""
import os
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "backend"))
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "backend"))
from loguru import logger
from app.config.logging import setup_logging
from app.services.document_processor import DocumentProcessor, ProcessingResult
from app.services.storage.milvus_client import MilvusClient
from app.config.settings import settings
# 设置日志
from app.config.logging import setup_logging
from app.config.settings import settings
from app.shared.bootstrap import (
get_document_command_service,
get_retrieval_service,
get_vector_index,
)
setup_logging(level="INFO")
def verify_milvus_connection():
    """Check that Milvus is reachable and the collection can be (re)created.

    Connects through ``MilvusClient``, recreates the collection, logs the
    collection stats, and disconnects again. Once a connection has been
    established the client is always disconnected, even when a later step
    raises.

    Returns:
        bool: True when the connection and collection setup succeed, False
        when ``connect()`` reports failure or any step raises.
    """
    logger.info("=" * 50)
    logger.info("Step 1: 验证Milvus连接")
    logger.info("=" * 50)

    client = MilvusClient()
    try:
        if not client.connect():
            logger.error("Milvus连接失败请检查docker-compose是否启动")
            return False

        try:
            logger.success("Milvus连接成功")
            # Create the collection.
            # NOTE(review): recreate=True presumably drops an existing
            # collection -- confirm this is acceptable wherever the script runs.
            client.create_collection(recreate=True)
            stats = client.get_collection_stats()
            logger.info(f"Collection信息: {stats}")
            return True
        finally:
            # Release the connection on both the success and the error path.
            client.disconnect()
    except Exception as e:
        logger.error(f"Milvus连接异常: {e}")
        logger.info("请先启动Milvus: cd docker && docker-compose up -d")
        return False
def verify_embedding_model():
    """Load the BGE-M3 embedder and embed one probe sentence.

    Logs the length of the dense and sparse parts of the embedding result.

    Returns:
        bool: True when the model loads and the probe is embedded, False
        when any step raises.
    """
    rule = "=" * 50
    for line in (rule, "Step 2: 验证BGE-M3嵌入模型", rule):
        logger.info(line)

    passed = False
    try:
        # Imported lazily so a missing dependency is reported as a failed check.
        from app.services.embedding.bge_m3_embedder import BGEM3Embedder

        model = BGEM3Embedder()
        logger.success("嵌入模型加载成功")

        # Embed a single probe sentence.
        probe = "这是一条测试文本,用于验证嵌入模型功能"
        vectors = model.embed_single(probe)
        logger.info(f"Dense向量维度: {len(vectors['dense'])}")
        logger.info(f"Sparse向量词数: {len(vectors['sparse'])}")
        passed = True
    except Exception as exc:
        logger.error(f"嵌入模型验证失败: {exc}")
        logger.info("请确保已安装FlagEmbedding: pip install FlagEmbedding")
    return passed
def verify_sample_document():
    """Run chunk -> embed -> insert -> hybrid search on a built-in sample text.

    Uses an inline regulation excerpt so no external file is needed. The
    Milvus client is disconnected in a ``finally`` block once ``connect()``
    has been called, so a failing insert or search does not leave the
    connection open.

    Returns:
        bool: True when every stage completes, False when any stage raises.
    """
    logger.info("=" * 50)
    logger.info("Step 3: 验证文档处理流程")
    logger.info("=" * 50)

    # Built-in sample text (no external file required).
    sample_text = """
# GB 7258-2017 机动车运行安全技术条件
第一章 范围
第一条 本标准规定了机动车运行安全技术条件,适用于在我国道路上行驶的所有机动车。
第二条 本标准包括整车、发动机、传动系、行驶系、制动系、照明与信号装置等技术要求。
第二章 术语和定义
第三条 下列术语和定义适用于本标准:
(一)机动车:以动力装置驱动或者牵引,上道路行驶的供人员乘用或者用于运送物品的轮式车辆。
(二)整车产品:完整的机动车产品,包括所有必要的部件和系统。
第三章 整车技术要求
第四条 机动车整车应满足以下基本技术要求:
1. 车辆外廓尺寸应符合规定限值;
2. 车辆应具有唯一的产品标识;
3. 车辆结构应安全可靠,各部件连接牢固。
第五条 车辆应配备必要的安全装置,包括:
- 制动系统
- 照明与信号装置
- 安全带
- 灭火器
"""
    try:
        from app.services.embedding.text_chunker import RegulationChunker
        from app.services.embedding.bge_m3_embedder import BGEM3Embedder
        from app.services.storage.milvus_client import MilvusClient

        # 1. Chunking
        logger.info("测试分块...")
        chunker = RegulationChunker(chunk_size=256)
        chunks = chunker.chunk_document(
            sample_text,
            doc_id="gb7258_test",
            doc_name="GB 7258-2017 测试",
            regulation_type="车辆安全"
        )
        logger.success(f"分块完成,共{len(chunks)}个chunk")

        # 2. Embedding
        logger.info("测试嵌入...")
        embedder = BGEM3Embedder()
        embeddings = embedder.embed([c.content for c in chunks])
        logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}")

        # 3. Insert into Milvus
        logger.info("测试入库...")
        client = MilvusClient()
        client.connect()
        try:
            client.create_collection(recreate=False)
            client.load_collection()
            inserted_ids = client.insert_chunks(chunks, embeddings)
            logger.success(f"入库完成,共{len(inserted_ids)}条记录")

            # 4. Hybrid search
            logger.info("测试检索...")
            query = "机动车安全技术要求"
            query_emb = embedder.embed_single(query)
            results = client.hybrid_search(
                query_dense=query_emb['dense'].tolist(),
                query_sparse=query_emb['sparse'],
                top_k=3
            )
            logger.success(f"检索完成,返回{len(results)}条结果")
            for i, r in enumerate(results):
                logger.info(f"结果{i+1}: 分数={r.score:.4f}, 内容={r.content[:50]}...")
        finally:
            # Always release the connection, including when insert/search fails.
            client.disconnect()
        return True
    except Exception as e:
        logger.error(f"文档处理验证失败: {e}")
        return False
def verify_service_wiring() -> bool:
    """Verify the new module layout and service entrypoints can be imported.

    The imports are the actual check: a missing or broken module raises and
    is reported as a failure. Each imported name is additionally checked for
    being bound to a non-None object with an explicit comparison rather than
    ``assert``, so the check is not stripped under ``python -O`` and a
    failure names the offending entrypoint.

    Returns:
        True when every entrypoint imports and is non-None, False otherwise.
    """
    logger.info("=" * 60)
    logger.info("Step 1: verify module wiring")
    logger.info("=" * 60)
    try:
        from app.api.main import app
        from app.application.agent import AgentConversationService
        from app.application.documents import DocumentCommandService, DocumentQueryService
        from app.application.knowledge import KnowledgeRetrievalService
        from app.shared import bootstrap

        entrypoints = {
            "app.api.main.app": app,
            "DocumentCommandService": DocumentCommandService,
            "DocumentQueryService": DocumentQueryService,
            "KnowledgeRetrievalService": KnowledgeRetrievalService,
            "AgentConversationService": AgentConversationService,
            "app.shared.bootstrap": bootstrap,
        }
        unbound = [name for name, obj in entrypoints.items() if obj is None]
        if unbound:
            logger.error(f"module wiring failed: unbound entrypoints {unbound}")
            return False

        logger.success("module wiring ok")
        return True
    except Exception as exc:
        logger.error(f"module wiring failed: {exc}")
        return False
def verify_migration_config() -> bool:
    """Verify migration-critical config values.

    Compares ``settings.embedding_model``, ``settings.embedding_dim`` and
    ``settings.milvus_collection`` with the expected values. Every actual
    value is logged, and each mismatch is reported with both the expected
    and the actual value. Explicit comparisons are used instead of ``assert``
    so the check still runs under ``python -O`` and the failure message is
    never empty.

    Returns:
        True when all values match, False on any mismatch or when a setting
        cannot be read.
    """
    logger.info("=" * 60)
    logger.info("Step 2: verify migration config")
    logger.info("=" * 60)

    expected = {
        "embedding_model": "text-embedding-v3",
        "embedding_dim": 1536,
        "milvus_collection": "regulations_dense_1536",
    }
    try:
        mismatches = []
        for key, wanted in expected.items():
            actual = getattr(settings, key)
            logger.info(f"{key}={actual}")
            if actual != wanted:
                mismatches.append(f"{key}: expected {wanted!r}, got {actual!r}")

        if mismatches:
            detail = "; ".join(mismatches)
            logger.error(f"migration config mismatch: {detail}")
            return False

        logger.success("migration config ok")
        return True
    except Exception as exc:
        logger.error(f"migration config mismatch: {exc}")
        return False
def verify_minio_connection() -> bool:
    """Verify MinIO connectivity for the binary store path.

    Resolves the document command service and checks that its
    ``binary_store`` attribute is set. The check is an explicit comparison
    rather than ``assert`` so it still runs under ``python -O`` and reports
    a non-empty reason.

    NOTE(review): only the presence of ``binary_store`` is checked here; no
    request is sent to the store by this function itself. Whether building
    the service already contacts MinIO cannot be told from this file.

    Returns:
        True when the binary store is available, False otherwise.
    """
    logger.info("=" * 60)
    logger.info("Step 3: verify MinIO connection")
    logger.info("=" * 60)
    try:
        binary_store = get_document_command_service().binary_store
        if binary_store is None:
            logger.error("MinIO connection failed: binary_store is not configured")
            logger.info("start MinIO first or update .env storage settings")
            return False

        logger.success("MinIO connection ok")
        return True
    except Exception as exc:
        logger.error(f"MinIO connection failed: {exc}")
        logger.info("start MinIO first or update .env storage settings")
        return False
def verify_milvus_connection() -> bool:
    """Verify dense-only Milvus adapter connectivity.

    Calls ``health()`` on the vector index and logs whatever it returns.
    The call completing without raising is treated as success.

    NOTE(review): the returned health value is logged but not inspected --
    confirm whether an unhealthy status is signalled by an exception.

    Returns:
        True when the health call completes, False when it raises.
    """
    rule = "=" * 60
    for line in (rule, "Step 4: verify Milvus connection", rule):
        logger.info(line)

    reachable = False
    try:
        index_health = get_vector_index().health()
        logger.info(f"Milvus health: {index_health}")
        logger.success("Milvus connection ok")
        reachable = True
    except Exception as err:
        logger.error(f"Milvus connection failed: {err}")
        logger.info("start Milvus first or update .env vector settings")
    return reachable
def verify_ingest_pipeline(sample_file: Path) -> bool:
    """Verify upload -> parse -> embed -> index using a real file.

    Preconditions checked before any service call: the file exists, has a
    ``.pdf``/``.doc``/``.docx`` suffix, and the Aliyun parser credentials are
    present in ``settings``. The file is then pushed through
    ``upload_and_process`` and queried back with a ``doc_id`` filter.

    Args:
        sample_file: Path of the document to ingest.

    Returns:
        True when the document reaches status ``"indexed"`` and the retrieval
        call completes, False on a failed precondition, a non-indexed result,
        or any raised exception.
    """
    logger.info("=" * 60)
    logger.info("Step 5: verify real ingest pipeline")
    logger.info("=" * 60)

    if not sample_file.exists():
        logger.error(f"sample file not found: {sample_file}")
        return False
    if sample_file.suffix.lower() not in {".pdf", ".doc", ".docx"}:
        logger.error("sample file must be PDF, DOC, or DOCX")
        return False
    if not settings.alibaba_access_key_id or not settings.alibaba_access_key_secret:
        logger.error("missing Aliyun parser credentials")
        return False

    try:
        result = get_document_command_service().upload_and_process(
            file_name=sample_file.name,
            content=sample_file.read_bytes(),
            content_type=_guess_content_type(sample_file),
            doc_name=sample_file.stem,
            regulation_type="smoke-test",
            version="migration",
            generate_summary=False,
        )
        logger.info(f"process result: doc_id={result.doc_id}, status={result.status}, chunks={result.num_chunks}")
        if result.status != "indexed":
            logger.error(f"ingest failed: {result.message}")
            return False

        # Query the freshly ingested document back, restricted to its doc_id.
        retrieval_results = get_retrieval_service().retrieve(
            query=sample_file.stem,
            top_k=3,
            filters=f'doc_id == "{result.doc_id}"',
        )
        logger.info(f"retrieval count: {len(retrieval_results)}")
        if not retrieval_results:
            # Surface an empty result instead of reporting it as a clean pass.
            # Kept as a warning, not a failure: presumably fresh inserts may
            # not be searchable immediately -- TODO confirm and tighten.
            logger.warning("retrieval returned no results for the ingested document")

        logger.success("real ingest pipeline ok")
        return True
    except Exception as exc:
        logger.error(f"real ingest pipeline failed: {exc}")
        return False
def _guess_content_type(sample_file: Path) -> str:
    """Map a document's file suffix to its MIME type.

    The suffix is matched case-insensitively. Anything other than
    ``.pdf``, ``.doc`` or ``.docx`` maps to ``application/octet-stream``.
    """
    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    return mime_by_suffix.get(sample_file.suffix.lower(), "application/octet-stream")
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the smoke-check script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior existing callers
            get. Passing a list lets the parser be driven programmatically.

    Returns:
        Namespace with ``sample_file`` set to a ``Path``, or ``None`` when
        ``--sample-file`` is not given.
    """
    parser = argparse.ArgumentParser(description="Verify the migrated backend path")
    parser.add_argument("--sample-file", type=Path, help="Optional PDF/DOC/DOCX for real ingest verification")
    return parser.parse_args(argv)
def main() -> bool:
    """Run the smoke checks and log a PASS/FAIL summary.

    The four environment checks always run. The real ingest check runs only
    when ``--sample-file`` is given on the command line.

    Returns:
        True when every executed check passed, False otherwise.
    """
    args = parse_args()

    # The list literal evaluates every check eagerly, so one failing check
    # does not prevent the later ones from running.
    results = [
        ("module_wiring", verify_service_wiring()),
        ("migration_config", verify_migration_config()),
        ("minio_connection", verify_minio_connection()),
        ("milvus_connection", verify_milvus_connection()),
    ]
    if args.sample_file:
        results.append(("real_ingest_pipeline", verify_ingest_pipeline(args.sample_file)))
    else:
        logger.info("no sample file provided; skip real ingest check")

    # Summary of all executed checks.
    logger.info("\n" + "=" * 60)
    logger.info("check summary")
    logger.info("=" * 60)
    for name, passed in results:
        status = "PASS" if passed else "FAIL"
        logger.info(f"{name}: {status}")

    all_passed = all(passed for _, passed in results)
    if all_passed:
        logger.success("all executed checks passed")
    else:
        logger.warning("some checks failed; inspect environment dependencies")
    return all_passed
if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 when every executed check
    # passed, 1 otherwise. main() is invoked exactly once.
    sys.exit(0 if main() else 1)