Files
AIRegulation-Deployment/scripts/05_init_db.sh
2026-04-23 14:50:24 +08:00

163 lines
7.2 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# ══════════════════════════════════════════════════
# 05_init_db.sh
# Initialise the databases: PostgreSQL schema + Milvus collections + Neo4j constraints
# Usage: bash scripts/05_init_db.sh
# Prerequisite: postgres / milvus / neo4j are already running and healthy
# ══════════════════════════════════════════════════

# Strict mode: abort on unhandled errors, on unset variables and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Resolve the absolute directory holding this script, then work from its
# parent directory (PROJECT_DIR) so the relative paths used below resolve.
this_script="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "$this_script")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"
# ANSI colour escapes, kept in backslash form; they are expanded at print time
# by printf %b (and by the echo -e calls elsewhere in this script).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Logging helpers.
# Arguments: $@ - message words, joined with spaces into one line
# Outputs:   info/ok write to stdout; warn/error write to stderr so that
#            diagnostics are not mixed into captured or piped output
# Returns:   error() terminates the script with exit status 1
# The message is printed with %s, so backslashes in it are emitted literally;
# only the colour variables go through %b escape expansion.
info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
ok() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }
error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2
  exit 1
}
# Load optional overrides from the .env file in the current directory.
# The explicit ./ prefix matters: a file name without a slash makes `source`
# search $PATH before the current directory.
# Loading stays best-effort on purpose: a missing or unreadable file is
# ignored (error output discarded, failure swallowed by `|| true`).
# shellcheck source=/dev/null
source ./.env 2>/dev/null || true

# Fall back to built-in values when the variables are unset or empty.
# NOTE(review): hard-coded fallback credentials; presumably meant for local
# development only - confirm before using this script elsewhere.
POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-compliance123}
NEO4J_PASSWORD=${NEO4J_PASSWORD:-neo4j123}
# ── Step 1: PostgreSQL schema ───────────────────
info "Step 1/3初始化 PostgreSQL Schema..."
# Capture the status listing first and match the exact "(healthy)" marker.
# A plain substring search for "healthy" also matches "(unhealthy)", and
# piping into `grep -q` under pipefail can fail spuriously when grep exits
# early and the producer receives SIGPIPE.
postgres_status="$(docker compose ps postgres || true)"
if [[ "$postgres_status" == *"(healthy)"* ]]; then
  # Apply the schema file inside the container and show only the last lines
  # of its output. A failure of this pipeline is deliberately downgraded to a
  # warning, and the step is still reported as done.
  # NOTE(review): psql is run without ON_ERROR_STOP, so it presumably exits 0
  # on SQL errors and the warning only fires on connection/exec failures -
  # confirm this is the intended behaviour.
  docker compose exec -T postgres psql \
    -U compliance -d compliance_db \
    -f /docker-entrypoint-initdb.d/01_init_schema.sql \
    2>&1 | tail -5 || warn "SQL 可能部分已存在IF NOT EXISTS这是正常的"
  ok "PostgreSQL Schema 初始化完成"
else
  error "PostgreSQL 未运行请先执行bash scripts/03_start_infra.sh"
fi
# ── Step 2: Milvus collections ──────────────────
info "Step 2/3初始化 Milvus Collections..."
# Capture the status listing first and match the exact "(healthy)" marker.
# A plain substring search for "healthy" also matches "(unhealthy)", and
# piping into `grep -q` under pipefail can fail spuriously when grep exits
# early and the producer receives SIGPIPE.
milvus_status="$(docker compose ps milvus || true)"
if [[ "$milvus_status" == *"(healthy)"* ]]; then
  # Run a one-off compliance-backend container (without starting its
  # dependencies) and feed it the Python program below via -c. The program is
  # a double-quoted shell string, so it must not contain double quotes,
  # dollar signs or backticks.
  docker compose run --rm --no-deps compliance-backend \
    python3 -c "
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility

connections.connect(host='milvus', port='19530')
print('Milvus 连接成功')


def create_regulation_chunks():
    '''
    regulation_chunks: regulation-specific collection with a sparse vector
    and clause-number fields.
    During development every run force-recreates it (keeps the schema current),
    so any data already stored in the collection is dropped.
    '''
    name = 'regulation_chunks'
    if utility.has_collection(name):
        utility.drop_collection(name)
        print(f' 已删除旧 collection {name}schema 升级)')
    fields = [
        FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name='sparse_vec', dtype=DataType.SPARSE_FLOAT_VECTOR),
        FieldSchema(name='clause_no', dtype=DataType.VARCHAR, max_length=64, default_value=''),
        FieldSchema(name='article_no', dtype=DataType.VARCHAR, max_length=128, default_value=''),
        FieldSchema(name='regulation_name', dtype=DataType.VARCHAR, max_length=512, default_value=''),
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description='法规条款向量库(含稀疏向量和条款号)')
    col = Collection(name, schema)
    # Dense vectors: HNSW index with cosine similarity
    col.create_index('dense_vec', {
        'metric_type': 'COSINE',
        'index_type': 'HNSW',
        'params': {'M': 16, 'efConstruction': 200}
    })
    # Sparse vectors: inverted index with inner-product metric
    col.create_index('sparse_vec', {
        'metric_type': 'IP',
        'index_type': 'SPARSE_INVERTED_INDEX',
        'params': {'drop_ratio_build': 0.2}
    })
    col.load()
    print(f' Collection {name} 创建完成dense+sparse 双索引)')


def create_simple_collection(name, description):
    '''doc_chunks / case_library: generic collection with a dense vector only.

    Creates the collection only when it does not exist yet; an existing
    collection is left untouched.
    '''
    if utility.has_collection(name):
        print(f' Collection {name} 已存在,跳过')
        return
    fields = [
        FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description=description)
    col = Collection(name, schema)
    col.create_index('dense_vec', {
        'metric_type': 'COSINE',
        'index_type': 'HNSW',
        'params': {'M': 16, 'efConstruction': 200}
    })
    col.load()
    print(f' Collection {name} 创建完成')


create_regulation_chunks()
create_simple_collection('doc_chunks', '企业文档向量库')
create_simple_collection('case_library', '行业案例库')
print('Milvus 初始化完成')
" 2>&1
  ok "Milvus Collections 初始化完成"
else
  error "Milvus 未运行请先执行bash scripts/03_start_infra.sh"
fi
# ── Step 3: Neo4j constraints and indexes ────────────────────
info "Step 3/3初始化 Neo4j 约束和索引..."
# Neo4j may still be warming up. Instead of one fixed sleep, probe it with a
# trivial query (at most 12 tries, 5 s apart) and continue as soon as it
# answers. The probe is best-effort: if it never succeeds, the real command
# below still runs, prints the actual error and aborts the script under
# errexit. stdin is detached so the probe cannot consume this script's input.
for attempt in {1..12}; do
  if docker compose exec -T neo4j cypher-shell \
    -u neo4j -p "$NEO4J_PASSWORD" \
    'RETURN 1;' </dev/null >/dev/null 2>&1; then
    break
  fi
  sleep 5
done
# Feed the statements to cypher-shell on stdin. The quoted heredoc delimiter
# keeps the shell from expanding anything inside the Cypher text.
docker compose exec -T neo4j cypher-shell \
  -u neo4j -p "$NEO4J_PASSWORD" \
  --format plain <<'CYPHER'
// Node constraints (uniqueness)
CREATE CONSTRAINT regulation_id IF NOT EXISTS
FOR (r:Regulation) REQUIRE r.id IS UNIQUE;
CREATE CONSTRAINT clause_id IF NOT EXISTS
FOR (c:Clause) REQUIRE c.id IS UNIQUE;
CREATE CONSTRAINT obligation_id IF NOT EXISTS
FOR (o:Obligation) REQUIRE o.id IS UNIQUE;
// Full-text indexes (fuzzy search)
CREATE FULLTEXT INDEX regulation_fulltext IF NOT EXISTS
FOR (r:Regulation) ON EACH [r.title, r.code, r.domain];
CREATE FULLTEXT INDEX clause_fulltext IF NOT EXISTS
FOR (c:Clause) ON EACH [c.content, c.title];
// Insert sample nodes (connectivity check)
MERGE (d:Domain {name: 'vehicle_safety', label: '车辆安全法规'});
MERGE (d:Domain {name: 'data_security', label: '数据安全法规'});
MERGE (d:Domain {name: 'ehs', label: 'EHS安全法规'});
MERGE (d:Domain {name: 'carbon', label: '碳排放法规'});
RETURN '初始化完成' AS result;
CYPHER
ok "Neo4j 约束和索引初始化完成"
# Closing banner and summary of what was initialised. Banner lines go through
# %b so the colour escapes held in GREEN / NC are expanded; the plain lines
# are printed verbatim with %s.
summary_rule='══════════════════════════════════════════'
printf '\n'
printf '%b\n' \
  "${GREEN}${summary_rule}${NC}" \
  "${GREEN} 数据库初始化完成!${NC}" \
  "${GREEN}${summary_rule}${NC}"
printf '\n'
printf '%s\n' \
  " PostgreSQL: 所有表已创建" \
  " Milvus: regulation_chunks / doc_chunks / case_library" \
  " Neo4j: 约束 + 全文索引 + 基础域节点"
printf '\n'
printf '%s\n' "下一步bash scripts/06_start_all.sh"