#!/usr/bin/env python3
"""
Cleanup script: wipe all document data from the Milvus vector database and
the MinIO object store.

Usage:
    python scripts/clear_all.py            # clear everything
    python scripts/clear_all.py --milvus   # clear Milvus only
    python scripts/clear_all.py --minio    # clear MinIO only
    python scripts/clear_all.py --dry-run  # report counts only, delete nothing
"""

import argparse
import sys
import os

# Put the parent of this script's directory on sys.path so that the
# project-local ``src`` package below can be imported when the script is run
# directly. This must happen before the ``src`` import.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.config.settings import settings
from loguru import logger


def clear_milvus(dry_run: bool = False) -> int:
    """Drop and recreate the configured Milvus collection.

    Args:
        dry_run: When True, only report the number of entities in the
            collection and delete nothing.

    Returns:
        The entity count in dry-run mode; 0 after a successful clear or when
        the collection does not exist; -1 if any step failed.
    """
    # Import separately so the ``finally`` clause below never references an
    # unbound ``connections`` name when pymilvus is not installed.
    try:
        from pymilvus import Collection, connections, utility
    except ImportError as e:
        logger.error(f"清理Milvus失败: {e}")
        return -1

    try:
        logger.info(f"连接Milvus: {settings.milvus_host}:{settings.milvus_port}")
        connections.connect(
            alias="default",
            host=settings.milvus_host,
            port=settings.milvus_port,
        )

        collection_name = settings.milvus_collection

        if not utility.has_collection(collection_name):
            logger.info(f"Milvus collection '{collection_name}' 不存在")
            return 0

        if dry_run:
            # Statistics only.
            # NOTE(review): load() is kept from the original flow; it may not
            # be required just to read num_entities — confirm.
            collection = Collection(collection_name)
            collection.load()
            count = collection.num_entities
            logger.info(
                f"[DRY-RUN] Milvus collection '{collection_name}' 包含 {count} 条记录"
            )
            return count

        # Dropping the collection removes all of its data.
        logger.info(f"删除collection: {collection_name}")
        utility.drop_collection(collection_name)
        logger.success(f"Milvus collection '{collection_name}' 已删除")

        # Recreate an empty collection through the project's own client.
        logger.info("重新创建collection...")
        from src.services.storage.milvus_client import MilvusClient

        client = MilvusClient()
        client.connect()
        client.create_collection(recreate=True)
        client.disconnect()
        logger.success("Milvus collection已重新创建")
        return 0

    except Exception as e:
        logger.error(f"清理Milvus失败: {e}")
        return -1

    finally:
        # Best-effort disconnect: a failure here must not mask the result,
        # but it is logged instead of being silently swallowed.
        try:
            connections.disconnect("default")
        except Exception as e:
            logger.debug(f"断开Milvus连接失败: {e}")


def clear_minio(dry_run: bool = False) -> int:
    """Remove every object from the configured MinIO bucket.

    Objects are listed recursively so that objects stored under prefixes
    ("folders") are included; a non-recursive listing would only return
    top-level entries and leave nested objects behind.

    Args:
        dry_run: When True, only list and count the objects; delete nothing.

    Returns:
        The object count in dry-run mode; the number of deleted objects after
        a fully successful clear; 0 when the bucket does not exist; -1 if the
        operation failed or any object could not be deleted.
    """
    try:
        from minio import Minio
        from minio.error import S3Error

        logger.info(f"连接MinIO: {settings.minio_endpoint}")
        client = Minio(
            settings.minio_endpoint,
            access_key=settings.minio_access_key,
            secret_key=settings.minio_secret_key,
            secure=settings.minio_secure,
        )

        bucket = settings.minio_bucket

        if not client.bucket_exists(bucket):
            logger.info(f"MinIO bucket '{bucket}' 不存在")
            return 0

        # Materialized so the total can be logged before deletion starts.
        objects = list(client.list_objects(bucket, recursive=True))
        count = len(objects)

        if dry_run:
            logger.info(f"[DRY-RUN] MinIO bucket '{bucket}' 包含 {count} 个对象:")
            for obj in objects:
                logger.info(f"  - {obj.object_name} ({obj.size} bytes)")
            return count

        logger.info(f"清空bucket '{bucket}' 中的 {count} 个对象...")
        deleted = 0
        failed = 0
        for obj in objects:
            try:
                client.remove_object(bucket, obj.object_name)
            except S3Error as e:
                failed += 1
                logger.warning(f"  删除失败: {obj.object_name} - {e}")
            else:
                deleted += 1
                logger.info(f"  已删除: {obj.object_name}")

        if failed:
            # Do not report the bucket as emptied when objects remain.
            logger.error(f"MinIO未完全清空: 已删除 {deleted} 个对象, 失败 {failed} 个")
            return -1

        logger.success(f"MinIO已清空,删除 {deleted} 个对象")
        return deleted

    except Exception as e:
        logger.error(f"清理MinIO失败: {e}")
        return -1


def main() -> int:
    """Parse CLI flags, run the selected cleanups and print a summary.

    With neither ``--milvus`` nor ``--minio`` given, both stores are
    processed.

    Returns:
        Process exit code: 0 if every selected cleanup succeeded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="清空Milvus和MinIO中的所有文档数据"
    )
    parser.add_argument(
        "--milvus",
        action="store_true",
        help="仅清空Milvus向量数据库",
    )
    parser.add_argument(
        "--minio",
        action="store_true",
        help="仅清空MinIO对象存储",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="仅查看数据统计,不执行删除",
    )
    args = parser.parse_args()

    # Replace loguru's default sink with a compact stdout format.
    logger.remove()
    logger.add(sys.stdout, format="{time:HH:mm:ss} | {level} | {message}")

    logger.info("=" * 50)
    logger.info("文档数据清理脚本")
    logger.info("=" * 50)

    if args.dry_run:
        logger.warning("[DRY-RUN 模式] 仅统计,不删除数据")

    # No explicit target selected means "process everything".
    run_all = not (args.milvus or args.minio)
    results = {}

    if args.milvus or run_all:
        logger.info("\n[1] 清理Milvus向量数据库")
        results["milvus"] = clear_milvus(dry_run=args.dry_run)

    if args.minio or run_all:
        logger.info("\n[2] 清理MinIO对象存储")
        results["minio"] = clear_minio(dry_run=args.dry_run)

    # Summary: a negative result marks a failed cleanup.
    logger.info("\n" + "=" * 50)
    logger.info("清理结果摘要:")
    for name, count in results.items():
        if count >= 0:
            status = "已清空" if not args.dry_run else "统计完成"
            logger.info(f"  {name}: {status} ({count} 条/个)")
        else:
            logger.error(f"  {name}: 清理失败")
    logger.info("=" * 50)

    if all(c >= 0 for c in results.values()):
        logger.success("清理完成!")
        return 0

    logger.error("清理失败,请检查错误信息")
    return 1


if __name__ == "__main__":
    sys.exit(main())