2026-04-23 09:58:47 +08:00
|
|
|
|
#!/usr/bin/env bash
# ══════════════════════════════════════════════════
# 05_init_db.sh
# Initialise the data stores: PostgreSQL schema + Milvus collections +
# Neo4j constraints.
# Usage:        bash scripts/05_init_db.sh
# Prerequisite: postgres / milvus / neo4j are already running and healthy
# ══════════════════════════════════════════════════
set -o errexit -o nounset -o pipefail

# Resolve the absolute directory that holds this script, then switch to its
# parent directory so every later relative path is evaluated from there,
# whatever the caller's working directory was.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_DIR="$(dirname "${SCRIPT_DIR}")"
cd "${PROJECT_DIR}"
|
|
|
|
|
|
|
|
|
|
|
|
# ANSI colour escapes. They are stored as literal backslash sequences and are
# expanded only when printed (printf %b below, echo -e in the closing banner).
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'

# Logging helpers. Each takes the message as its arguments ("$*").
# The colour prefix goes through %b so the escapes are rendered, while the
# message goes through %s so backslashes in caller text are printed verbatim.

# info MESSAGE... : progress line on stdout.
info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }

# ok MESSAGE... : success line on stdout.
ok() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }

# warn MESSAGE... : non-fatal diagnostic on stderr.
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }

# error MESSAGE... : fatal diagnostic on stderr, then terminates with status 1.
error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2; exit 1; }
|
|
|
|
|
|
|
|
|
|
|
|
# Read settings from a .env file in the current directory if there is one.
# The redirect plus "|| true" makes a missing or unreadable file non-fatal;
# note that it equally hides any error raised while the file is being read.
# shellcheck disable=SC1091
. .env 2>/dev/null || true

# Fall back to the built-in defaults when a password is unset or empty.
: "${POSTGRES_PASSWORD:=compliance123}"
: "${NEO4J_PASSWORD:=neo4j123}"
|
|
|
|
|
|
|
|
|
|
|
|
# ── Step 1: PostgreSQL schema ───────────────────
info "Step 1/3:初始化 PostgreSQL Schema..."

# Health gate. The status table is captured first and matched against the
# literal "(healthy)" marker because:
#   * a bare "healthy" pattern also matches "(unhealthy)";
#   * piping straight into `grep -q` with pipefail enabled can report failure
#     when grep exits early and the producer is killed by SIGPIPE.
# A failing `docker compose ps` is treated the same as "not healthy".
pg_status="$(docker compose ps postgres)" || pg_status=""
if [[ "${pg_status}" == *"(healthy)"* ]]; then
  # Apply the schema file from inside the container; only the last five
  # output lines are shown.
  # NOTE(review): without ON_ERROR_STOP psql exits 0 even when individual
  # statements fail, so the warning branch is reached only when psql or
  # docker itself fails (connection problem, missing file). Confirm that the
  # "this is normal" wording is still what is wanted for that case.
  docker compose exec -T postgres psql \
    -U compliance -d compliance_db \
    -f /docker-entrypoint-initdb.d/01_init_schema.sql \
    2>&1 | tail -5 || warn "SQL 可能部分已存在(IF NOT EXISTS),这是正常的"
  ok "PostgreSQL Schema 初始化完成"
else
  error "PostgreSQL 未运行,请先执行:bash scripts/03_start_infra.sh"
fi
|
|
|
|
|
|
|
|
|
|
|
|
# ── Step 2: Milvus collections ──────────────────
info "Step 2/3:初始化 Milvus Collections..."

# Health gate. The status table is captured first and matched against the
# literal "(healthy)" marker because a bare "healthy" pattern also matches
# "(unhealthy)", and `... | grep -q` with pipefail enabled can fail
# spuriously through SIGPIPE. A failing `docker compose ps` counts as
# "not healthy".
milvus_status="$(docker compose ps milvus)" || milvus_status=""
if [[ "${milvus_status}" == *"(healthy)"* ]]; then
  # The Python program is passed inline inside a double-quoted shell string,
  # so it must not contain double quotes, dollar signs, backticks or
  # backslashes. It runs in a one-off compliance-backend container.
  docker compose run --rm --no-deps compliance-backend \
    python3 -c "
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility

connections.connect(host='milvus', port='19530')
print('Milvus 连接成功')

# HNSW / COSINE settings shared by the dense_vec index of every collection.
DENSE_INDEX = {
    'metric_type': 'COSINE',
    'index_type': 'HNSW',
    'params': {'M': 16, 'efConstruction': 200},
}


def create_regulation_chunks():
    '''
    regulation_chunks: regulation-specific collection that carries a sparse
    vector and clause-number fields in addition to the dense vector.

    WARNING: an existing collection of this name is dropped and rebuilt on
    every run (development-stage behaviour that keeps the schema current),
    so data previously stored in it is lost.
    '''
    name = 'regulation_chunks'
    if utility.has_collection(name):
        utility.drop_collection(name)
        print(f' 已删除旧 collection {name}(schema 升级)')

    fields = [
        FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name='sparse_vec', dtype=DataType.SPARSE_FLOAT_VECTOR),
        FieldSchema(name='clause_no', dtype=DataType.VARCHAR, max_length=64, default_value=''),
        FieldSchema(name='article_no', dtype=DataType.VARCHAR, max_length=128, default_value=''),
        FieldSchema(name='regulation_name', dtype=DataType.VARCHAR, max_length=512, default_value=''),
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description='法规条款向量库(含稀疏向量和条款号)')
    col = Collection(name, schema)

    # One index per vector field: HNSW on the dense vector, inverted index
    # on the sparse vector.
    col.create_index('dense_vec', DENSE_INDEX)
    col.create_index('sparse_vec', {
        'metric_type': 'IP',
        'index_type': 'SPARSE_INVERTED_INDEX',
        'params': {'drop_ratio_build': 0.2},
    })
    col.load()
    print(f' Collection {name} 创建完成(dense+sparse 双索引)')


def create_simple_collection(name, description):
    '''
    doc_chunks / case_library: general-purpose collection with a dense
    vector only. An existing collection is left untouched.
    '''
    if utility.has_collection(name):
        print(f' Collection {name} 已存在,跳过')
        return

    fields = [
        FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description=description)
    col = Collection(name, schema)
    col.create_index('dense_vec', DENSE_INDEX)
    col.load()
    print(f' Collection {name} 创建完成')


create_regulation_chunks()
create_simple_collection('doc_chunks', '企业文档向量库')
create_simple_collection('case_library', '行业案例库')

print('Milvus 初始化完成')
" 2>&1
  ok "Milvus Collections 初始化完成"
else
  error "Milvus 未运行,请先执行:bash scripts/03_start_infra.sh"
fi
|
|
|
|
|
|
|
|
|
|
|
|
# ── Step 3: Neo4j constraints and indexes ───────
info "Step 3/3:初始化 Neo4j 约束和索引..."

# Neo4j may still be warming up. Rather than a single fixed pause, probe it
# with a trivial query and retry: a ready server is used immediately, a slow
# one gets up to 12 attempts with 5 s between them, and a server that never
# answers produces an actionable message instead of a bare non-zero exit.
neo4j_ready=0
for _ in {1..12}; do
  if docker compose exec -T neo4j cypher-shell \
    -u neo4j -p "$NEO4J_PASSWORD" \
    'RETURN 1;' </dev/null >/dev/null 2>&1; then
    neo4j_ready=1
    break
  fi
  sleep 5
done
if (( ! neo4j_ready )); then
  error "Neo4j 未运行,请先执行:bash scripts/03_start_infra.sh"
fi

# The statements are fed on stdin through a quoted heredoc, so the shell
# performs no expansion inside them. Every CREATE uses IF NOT EXISTS and the
# seed nodes use MERGE, so the batch can be re-run.
docker compose exec -T neo4j cypher-shell \
  -u neo4j -p "$NEO4J_PASSWORD" \
  --format plain <<'CYPHER'
// Node constraints (uniqueness)
CREATE CONSTRAINT regulation_id IF NOT EXISTS
FOR (r:Regulation) REQUIRE r.id IS UNIQUE;
CREATE CONSTRAINT clause_id IF NOT EXISTS
FOR (c:Clause) REQUIRE c.id IS UNIQUE;
CREATE CONSTRAINT obligation_id IF NOT EXISTS
FOR (o:Obligation) REQUIRE o.id IS UNIQUE;

// Full-text indexes (fuzzy lookup)
CREATE FULLTEXT INDEX regulation_fulltext IF NOT EXISTS
FOR (r:Regulation) ON EACH [r.title, r.code, r.domain];
CREATE FULLTEXT INDEX clause_fulltext IF NOT EXISTS
FOR (c:Clause) ON EACH [c.content, c.title];

// Seed example nodes (also verifies connectivity)
MERGE (d:Domain {name: 'vehicle_safety', label: '车辆安全法规'});
MERGE (d:Domain {name: 'data_security', label: '数据安全法规'});
MERGE (d:Domain {name: 'ehs', label: 'EHS安全法规'});
MERGE (d:Domain {name: 'carbon', label: '碳排放法规'});
RETURN '初始化完成' AS result;
CYPHER
ok "Neo4j 约束和索引初始化完成"
|
|
|
|
|
|
|
|
|
|
|
|
# Closing banner followed by a recap of the three steps and the next command.
rule='══════════════════════════════════════════'

printf '\n'
# %b renders the colour escapes held in GREEN / NC.
printf '%b\n' \
  "${GREEN}${rule}${NC}" \
  "${GREEN} 数据库初始化完成!${NC}" \
  "${GREEN}${rule}${NC}"
printf '\n'
printf '%s\n' \
  " PostgreSQL: 所有表已创建" \
  " Milvus: regulation_chunks / doc_chunks / case_library" \
  " Neo4j: 约束 + 全文索引 + 基础域节点"
printf '\n'
printf '%s\n' "下一步:bash scripts/06_start_all.sh"
|