update
This commit is contained in:
@@ -38,42 +38,80 @@ info "Step 2/3:初始化 Milvus Collections..."
|
||||
if docker compose ps milvus | grep -q "healthy"; then
|
||||
docker compose run --rm --no-deps compliance-backend \
|
||||
python3 -c "
|
||||
import asyncio
|
||||
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
|
||||
|
||||
connections.connect(host='milvus', port='19530')
|
||||
print('Milvus 连接成功')
|
||||
|
||||
def create_regulation_chunks():
    '''
    Drop and recreate the regulation_chunks collection.

    This is the regulation-specific collection: besides the dense vector it
    carries a sparse vector field and clause/article number fields.
    During the development stage every run force-rebuilds the collection so
    that the schema stays up to date; any data already stored in it is
    discarded by the drop.

    Takes no arguments and returns None. Uses the module-level pymilvus
    connection opened earlier in this script.
    '''
    # NOTE: this code is embedded in a shell double-quoted string, so only
    # single-quoted Python string literals are used here.
    name = 'regulation_chunks'
    if utility.has_collection(name):
        # Rebuild unconditionally: an existing collection may have an old schema.
        utility.drop_collection(name)
        print(f' 已删除旧 collection {name}(schema 升级)')

    fields = [
        FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name='sparse_vec', dtype=DataType.SPARSE_FLOAT_VECTOR),
        # Clause / article / regulation name default to an empty string.
        FieldSchema(name='clause_no', dtype=DataType.VARCHAR, max_length=64, default_value=''),
        FieldSchema(name='article_no', dtype=DataType.VARCHAR, max_length=128, default_value=''),
        FieldSchema(name='regulation_name', dtype=DataType.VARCHAR, max_length=512, default_value=''),
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description='法规条款向量库(含稀疏向量和条款号)')
    col = Collection(name, schema)

    # Dense vectors: HNSW index with cosine similarity.
    col.create_index('dense_vec', {
        'metric_type': 'COSINE',
        'index_type': 'HNSW',
        'params': {'M': 16, 'efConstruction': 200}
    })
    # Sparse vectors: inverted index with inner-product metric.
    col.create_index('sparse_vec', {
        'metric_type': 'IP',
        'index_type': 'SPARSE_INVERTED_INDEX',
        'params': {'drop_ratio_build': 0.2}
    })
    col.load()
    print(f' Collection {name} 创建完成(dense+sparse 双索引)')
|
||||
|
||||
|
||||
def create_simple_collection(name, description):
    '''
    Create a general-purpose, dense-vector-only collection if it is missing.

    Used for doc_chunks and case_library. Unlike a forced rebuild, an
    existing collection is left untouched and the function returns early.

    Args:
        name: collection name to create.
        description: human-readable description stored on the schema.

    Returns None.
    '''
    # NOTE: this code is embedded in a shell double-quoted string, so only
    # single-quoted Python string literals are used here.
    if utility.has_collection(name):
        print(f' Collection {name} 已存在,跳过')
        return

    # Exactly one primary key (pk) and one definition per field; duplicate
    # field names or two primary keys are rejected by the schema.
    fields = [
        FieldSchema(name='pk', dtype=DataType.VARCHAR, is_primary=True, max_length=128),
        FieldSchema(name='file_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='workspace_id', dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name='chunk_idx', dtype=DataType.INT64),
        FieldSchema(name='content', dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name='dense_vec', dtype=DataType.FLOAT_VECTOR, dim=1024),  # BGE-M3 dense
        FieldSchema(name='metadata', dtype=DataType.JSON),
    ]
    schema = CollectionSchema(fields, description=description)
    col = Collection(name, schema)

    # Create the vector index (HNSW, suitable for the exploration stage).
    col.create_index('dense_vec', {
        'metric_type': 'COSINE',
        'index_type': 'HNSW',
        'params': {'M': 16, 'efConstruction': 200}
    })
    col.load()
    print(f' Collection {name} 创建完成')
|
||||
|
||||
# Provision the three collections. regulation_chunks has its own builder
# (dense + sparse schema, rebuilt on every run); the other two use the
# dense-only builder, which skips collections that already exist.
create_regulation_chunks()
create_simple_collection('doc_chunks', '企业文档向量库')
create_simple_collection('case_library', '行业案例库')

print('Milvus 初始化完成')
|
||||
" 2>&1
|
||||
|
||||
Reference in New Issue
Block a user