222 lines
6.4 KiB
Python
222 lines
6.4 KiB
Python
|
|
"""
MVP feature verification script.

Verifies the complete document-processing pipeline:
1. PDF/DOCX parsing
2. Smart chunking
3. Vector embedding
4. Milvus ingestion
5. Hybrid retrieval

NOTE(review): the checks visible in this script feed an inline sample text to
the chunker; no PDF/DOCX parser is invoked here.

Usage:
1. Start Milvus first: docker-compose up -d
2. Run this script: python verify_mvp.py
"""

import os
import sys
import time

# Put the parent of this file's directory at the front of the import path so
# the `src.*` imports below can be resolved (presumably the project root --
# confirm against the repository layout). Must run before those imports.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

# NOTE(review): `time`, `DocumentProcessor`, `ProcessingResult` and `settings`
# are not referenced anywhere else in this script as reviewed.
from loguru import logger
from src.config.logging import setup_logging
from src.services.document_processor import DocumentProcessor, ProcessingResult
from src.services.storage.milvus_client import MilvusClient
from src.config.settings import settings

# Configure logging at INFO level before any verification step runs.
setup_logging(level="INFO")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def verify_milvus_connection():
    """Verify that Milvus is reachable and the collection can be created.

    Connects with a default-constructed ``MilvusClient``, calls
    ``create_collection(recreate=True)`` (presumably dropping any existing
    collection -- confirm against ``MilvusClient``), and logs the collection
    statistics.

    Returns:
        bool: ``True`` when connecting, creating the collection, fetching
        the stats and disconnecting all succeed; ``False`` when ``connect()``
        reports failure or any step raises.
    """
    logger.info("=" * 50)
    logger.info("Step 1: 验证Milvus连接")
    logger.info("=" * 50)

    client = MilvusClient()

    try:
        if not client.connect():
            logger.error("Milvus连接失败,请检查docker-compose是否启动")
            return False

        logger.success("Milvus连接成功")

        try:
            # Create the collection and report its statistics.
            client.create_collection(recreate=True)
            stats = client.get_collection_stats()
            logger.info(f"Collection信息: {stats}")
            return True
        finally:
            # Always release the connection once it has been opened, even
            # when collection creation or the stats lookup raises. An error
            # raised here still reaches the handler below.
            client.disconnect()

    except Exception as e:
        logger.error(f"Milvus连接异常: {e}")
        logger.info("请先启动Milvus: cd docker && docker-compose up -d")
        return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
def verify_embedding_model():
    """Smoke-test the BGE-M3 embedder by encoding a single sentence.

    Loads ``BGEM3Embedder``, embeds one probe sentence and logs the length
    of the ``dense`` and ``sparse`` entries of the result.

    Returns:
        bool: ``True`` when the model loads and the probe is embedded;
        ``False`` when any step (including the import) raises.
    """
    rule = "=" * 50
    logger.info(rule)
    logger.info("Step 2: 验证BGE-M3嵌入模型")
    logger.info(rule)

    try:
        # The import lives inside the try block, so an import failure is
        # caught below and reported as a failed step.
        from src.services.embedding.bge_m3_embedder import BGEM3Embedder

        model = BGEM3Embedder()
        logger.success("嵌入模型加载成功")

        # Embed one probe sentence and report the size of each vector.
        probe = "这是一条测试文本,用于验证嵌入模型功能"
        vectors = model.embed_single(probe)

        dense_size = len(vectors['dense'])
        logger.info(f"Dense向量维度: {dense_size}")
        sparse_size = len(vectors['sparse'])
        logger.info(f"Sparse向量词数: {sparse_size}")
    except Exception as err:
        logger.error(f"嵌入模型验证失败: {err}")
        logger.info("请确保已安装FlagEmbedding: pip install FlagEmbedding")
        return False

    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
def verify_sample_document():
    """Run the chunk -> embed -> insert -> search flow on a built-in sample.

    Steps, each logged as it runs:
      1. Chunk the inline sample regulation text with ``RegulationChunker``.
      2. Embed every chunk's ``content`` with ``BGEM3Embedder``.
      3. Connect to Milvus, ensure the collection exists
         (``recreate=False``), load it and insert the chunks.
      4. Embed a query and run ``hybrid_search`` with ``top_k=3``, logging
         each hit's score and the first 50 characters of its content.

    The Milvus connection is always released once it has been opened, even
    when a later step raises.

    Returns:
        bool: ``True`` when every step succeeds; ``False`` when the Milvus
        connection is refused or any step raises.
    """
    logger.info("=" * 50)
    logger.info("Step 3: 验证文档处理流程")
    logger.info("=" * 50)

    # Use a built-in sample text (no external file needed).
    sample_text = """
# GB 7258-2017 机动车运行安全技术条件

第一章 范围

第一条 本标准规定了机动车运行安全技术条件,适用于在我国道路上行驶的所有机动车。

第二条 本标准包括整车、发动机、传动系、行驶系、制动系、照明与信号装置等技术要求。

第二章 术语和定义

第三条 下列术语和定义适用于本标准:

(一)机动车:以动力装置驱动或者牵引,上道路行驶的供人员乘用或者用于运送物品的轮式车辆。

(二)整车产品:完整的机动车产品,包括所有必要的部件和系统。

第三章 整车技术要求

第四条 机动车整车应满足以下基本技术要求:

1. 车辆外廓尺寸应符合规定限值;
2. 车辆应具有唯一的产品标识;
3. 车辆结构应安全可靠,各部件连接牢固。

第五条 车辆应配备必要的安全装置,包括:
- 制动系统
- 照明与信号装置
- 安全带
- 灭火器
"""

    try:
        # Imports live inside the try block, so an import failure is caught
        # below and reported as a failed step.
        from src.services.embedding.text_chunker import RegulationChunker
        from src.services.embedding.bge_m3_embedder import BGEM3Embedder
        from src.services.storage.milvus_client import MilvusClient

        # 1. Chunking
        logger.info("测试分块...")
        chunker = RegulationChunker(chunk_size=256)
        chunks = chunker.chunk_document(
            sample_text,
            doc_id="gb7258_test",
            doc_name="GB 7258-2017 测试",
            regulation_type="车辆安全",
        )
        logger.success(f"分块完成,共{len(chunks)}个chunk")

        # 2. Embedding
        logger.info("测试嵌入...")
        embedder = BGEM3Embedder()
        embeddings = embedder.embed([c.content for c in chunks])
        logger.success(f"嵌入完成,向量数: {len(embeddings.dense_embeddings)}")

        # 3. Insertion
        logger.info("测试入库...")
        client = MilvusClient()
        # Same contract as the Step 1 check: a falsy result from connect()
        # means the connection was not established.
        if not client.connect():
            logger.error("Milvus连接失败,请检查docker-compose是否启动")
            return False

        try:
            client.create_collection(recreate=False)
            client.load_collection()

            inserted_ids = client.insert_chunks(chunks, embeddings)
            logger.success(f"入库完成,共{len(inserted_ids)}条记录")

            # 4. Retrieval
            logger.info("测试检索...")
            query = "机动车安全技术要求"
            query_emb = embedder.embed_single(query)

            results = client.hybrid_search(
                # assumes the dense vector exposes .tolist() (e.g. a numpy
                # array) -- TODO confirm against BGEM3Embedder
                query_dense=query_emb['dense'].tolist(),
                query_sparse=query_emb['sparse'],
                top_k=3,
            )
            logger.success(f"检索完成,返回{len(results)}条结果")

            for i, r in enumerate(results, start=1):
                logger.info(f"结果{i}: 分数={r.score:.4f}, 内容={r.content[:50]}...")

            return True
        finally:
            # Release the connection whether or not the steps above raised.
            # An error raised here still reaches the handler below.
            client.disconnect()

    except Exception as e:
        logger.error(f"文档处理验证失败: {e}")
        return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Run all verification steps and log a pass/fail summary.

    Every step is executed even if an earlier one failed, so the summary
    always lists all three results.

    Returns:
        bool: ``True`` only when every step reported success.
    """
    wide_rule = "=" * 60
    logger.info("\n" + wide_rule)
    logger.info("AI+合规智能中枢 MVP功能验证")
    logger.info(wide_rule)

    # Steps in execution order: Milvus connection, embedding model,
    # document processing.
    steps = (
        ("Milvus连接", verify_milvus_connection),
        ("嵌入模型", verify_embedding_model),
        ("文档处理", verify_sample_document),
    )
    outcomes = [(label, check()) for label, check in steps]

    # Summary of the results.
    logger.info("\n" + wide_rule)
    logger.info("验证结果汇总")
    logger.info(wide_rule)

    for label, ok in outcomes:
        verdict = "✅ 通过" if ok else "❌ 失败"
        logger.info(f"{label}: {verdict}")

    all_passed = all(ok for _, ok in outcomes)

    if not all_passed:
        logger.warning("\n⚠️ 部分验证失败,请检查配置和环境")
    else:
        logger.success("\n🎉 所有验证通过!MVP功能正常")

    return all_passed
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Exit status 0 when every verification passed, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)
|