# Viewer metadata carried over from the paste (not code): 486 lines, 16 KiB, Python
|
|
# src/services/storage/milvus_client.py
|
|||
|
|
"""Milvus vector database client: storage and retrieval service."""
|
|||
|
|
|
|||
|
|
from pymilvus import (
|
|||
|
|
connections,
|
|||
|
|
Collection,
|
|||
|
|
FieldSchema,
|
|||
|
|
CollectionSchema,
|
|||
|
|
DataType,
|
|||
|
|
utility
|
|||
|
|
)
|
|||
|
|
from typing import List, Dict, Optional, Any
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from loguru import logger
|
|||
|
|
import time
|
|||
|
|
import numpy as np
|
|||
|
|
|
|||
|
|
from ..embedding.text_chunker import TextChunk
|
|||
|
|
from ..embedding.bge_m3_embedder import EmbeddingResult
|
|||
|
|
from app.config.settings import settings
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class SearchResult:
    """A single search hit.

    Instances are produced by the search methods of ``MilvusClient``; after
    result fusion ``score`` holds the weighted sum of the dense and sparse
    scores instead of a raw similarity value.
    """

    # Primary key of the matching entity.
    id: int
    # Stored text of the matching chunk.
    content: str
    # Score reported by the search (or the fused score after merging).
    score: float
    # Remaining scalar fields of the hit (doc_id, chunk_id, doc_name, ...).
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class MilvusDocument:
    """One document chunk in the shape stored in Milvus.

    The attributes correspond one-to-one to the non-primary-key fields of
    ``MilvusClient.SCHEMA_FIELDS``; the auto-generated ``id`` is not included.
    """

    # Identifier of the source document.
    doc_id: str
    # Identifier of this chunk.
    chunk_id: str
    # Text of the chunk.
    content: str
    # Dense embedding (the schema declares dim=1024 for this field).
    dense_vector: List[float]
    # Sparse embedding as an index -> weight mapping.
    sparse_vector: Dict[int, float]
    # Name of the source document.
    doc_name: str
    # Title of the section the chunk belongs to.
    section_title: str
    # Clause number of the chunk.
    clause_number: str
    # Page number of the chunk.
    page_number: int
    # Regulation category label.
    regulation_type: str
    # Version label of the document.
    version: str
    # Creation timestamp; presumably Unix seconds, as written by
    # MilvusClient.insert_chunks -- confirm for other producers.
    create_time: int
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MilvusClient:
    """Client for storing and searching regulation chunks in a Milvus collection.

    Wraps the ``pymilvus`` ORM API (``connections``, ``Collection``,
    ``utility``) and always uses the connection alias ``"default"``. Most
    methods log failures through ``loguru`` and return an empty or falsy value
    instead of raising.
    """

    # Not consulted by ``__init__``, which falls back to
    # ``settings.milvus_collection`` when no name is passed.
    COLLECTION_NAME = "regulations"

    SCHEMA_FIELDS = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=8192),
        # NOTE(review): dim must equal the dense embedder's output size -- confirm.
        FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
        FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="clause_number", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="page_number", dtype=DataType.INT64),
        FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=32),
        FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=32),
        FieldSchema(name="create_time", dtype=DataType.INT64),
    ]

    # Scalar fields requested from Milvus for every search hit.
    OUTPUT_FIELDS = (
        "doc_id", "chunk_id", "content",
        "doc_name", "section_title", "clause_number",
        "page_number", "regulation_type", "version",
    )

    # Metadata keys copied from each hit, with the fallback used when the
    # field is missing from the returned entity.
    _METADATA_DEFAULTS = {
        "doc_id": "",
        "chunk_id": "",
        "doc_name": "",
        "section_title": "",
        "clause_number": "",
        "page_number": 0,
        "regulation_type": "",
        "version": "",
    }

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        collection_name: Optional[str] = None,
        db_name: Optional[str] = None
    ):
        """Store connection settings; no network access happens here.

        Any argument left as ``None`` (or otherwise falsy) falls back to the
        corresponding ``settings.milvus_*`` value.
        """
        self.host = host or settings.milvus_host
        self.port = port or settings.milvus_port
        self.collection_name = collection_name or settings.milvus_collection
        self.db_name = db_name or settings.milvus_db_name

        self.collection: Optional[Collection] = None
        self.connected = False

        logger.info(f"Milvus客户端配置: {self.host}:{self.port}, Collection: {self.collection_name}")

    def connect(self) -> bool:
        """Open the ``"default"`` connection; return True on success, False on any error."""
        try:
            connections.connect(
                alias="default",
                host=self.host,
                port=self.port,
                db_name=self.db_name
            )
            self.connected = True
            logger.success(f"Milvus连接成功: {self.host}:{self.port}")
            return True
        except Exception as e:
            logger.error(f"Milvus连接失败: {e}")
            self.connected = False
            return False

    def disconnect(self):
        """Close the ``"default"`` connection; errors are logged, not raised."""
        try:
            connections.disconnect("default")
            self.connected = False
            logger.info("Milvus连接已断开")
        except Exception as e:
            logger.warning(f"断开连接时出错: {e}")

    def create_collection(self, recreate: bool = False) -> bool:
        """Bind ``self.collection``, creating the collection when needed.

        Args:
            recreate: When True, an existing collection with the same name is
                dropped and rebuilt from ``SCHEMA_FIELDS``. When False, an
                existing collection is reused as-is (no index is created).

        Returns:
            True when ``self.collection`` is bound, False when not connected or
            when any step raised.
        """
        if not self.connected:
            logger.warning("未连接到Milvus,请先调用connect()")
            return False

        try:
            if utility.has_collection(self.collection_name):
                if recreate:
                    logger.info(f"删除已存在的Collection: {self.collection_name}")
                    utility.drop_collection(self.collection_name)
                else:
                    logger.info(f"Collection已存在: {self.collection_name}")
                    self.collection = Collection(self.collection_name)
                    return True

            schema = CollectionSchema(
                fields=self.SCHEMA_FIELDS,
                description="法规文档向量存储",
                enable_dynamic_field=True
            )

            self.collection = Collection(
                name=self.collection_name,
                schema=schema
            )

            # Index failures are only logged inside _create_indexes, so this
            # method still reports success when index creation fails.
            self._create_indexes()

            logger.success(f"Collection创建成功: {self.collection_name}")
            return True

        except Exception as e:
            logger.error(f"Collection创建失败: {e}")
            return False

    def _create_indexes(self):
        """Create the dense (IVF_FLAT/COSINE) and sparse (SPARSE_INVERTED_INDEX/IP) indexes.

        Does nothing when no collection is bound. Any error is logged as a
        warning and swallowed.
        """
        if not self.collection:
            return

        try:
            dense_index_params = {
                "metric_type": "COSINE",
                "index_type": "IVF_FLAT",
                "params": {"nlist": 128}
            }
            self.collection.create_index(
                field_name="dense_vector",
                index_params=dense_index_params
            )

            sparse_index_params = {
                "metric_type": "IP",
                "index_type": "SPARSE_INVERTED_INDEX",
                "params": {"drop_ratio_build": 0.2}
            }
            self.collection.create_index(
                field_name="sparse_vector",
                index_params=sparse_index_params
            )

            logger.success("向量索引创建成功")

        except Exception as e:
            logger.warning(f"创建索引时出错: {e}")

    def load_collection(self):
        """Load the bound collection; a no-op when none is bound. Errors propagate."""
        if self.collection:
            self.collection.load()
            logger.info(f"Collection已加载: {self.collection_name}")

    def release_collection(self):
        """Release the bound collection; a no-op when none is bound. Errors propagate."""
        if self.collection:
            self.collection.release()
            logger.info(f"Collection已释放: {self.collection_name}")

    def insert_chunks(
        self,
        chunks: List[TextChunk],
        embeddings: EmbeddingResult
    ) -> List[int]:
        """Insert chunks together with their dense and sparse embeddings.

        Args:
            chunks: Chunks to store; each row is built from ``chunk.content``
                and the attributes of ``chunk.metadata``.
            embeddings: Embedding result whose ``texts`` must have the same
                length as ``chunks``; ``dense_embeddings`` and
                ``sparse_embeddings`` are paired with the chunks by position.

        Returns:
            Primary keys of the inserted rows, or ``[]`` when no collection is
            bound, the lengths differ, ``chunks`` is empty, or the insert fails.
        """
        if not self.collection:
            logger.warning("Collection未初始化")
            return []

        if len(chunks) != len(embeddings.texts):
            logger.warning("Chunks数量与嵌入数量不匹配")
            return []

        logger.info(f"准备插入{len(chunks)}个文档分块")

        # Nothing to send: avoid a pointless insert/flush round trip.
        if not chunks:
            return []

        try:
            # One timestamp (Unix seconds) shared by the whole batch.
            current_time = int(time.time())

            data = [
                {
                    "doc_id": chunk.metadata.doc_id,
                    "chunk_id": chunk.metadata.chunk_id,
                    "content": chunk.content,
                    # assumes each dense embedding exposes .tolist() (e.g. a numpy array)
                    "dense_vector": dense_emb.tolist(),
                    "sparse_vector": sparse_emb,
                    "doc_name": chunk.metadata.doc_name,
                    "section_title": chunk.metadata.section_title,
                    "clause_number": chunk.metadata.clause_number,
                    "page_number": chunk.metadata.page_number,
                    "regulation_type": chunk.metadata.regulation_type,
                    "version": chunk.metadata.version,
                    "create_time": current_time
                }
                for chunk, dense_emb, sparse_emb in zip(
                    chunks,
                    embeddings.dense_embeddings,
                    embeddings.sparse_embeddings
                )
            ]

            result = self.collection.insert(data)
            self.collection.flush()

            logger.success(f"插入完成,共{len(result.primary_keys)}条记录")
            return result.primary_keys

        except Exception as e:
            logger.error(f"插入数据失败: {e}")
            return []

    def hybrid_search(
        self,
        query_dense: List[float],
        query_sparse: Dict[int, float],
        top_k: int = 10,
        filters: Optional[str] = None
    ) -> List[SearchResult]:
        """Run a dense search and, when a sparse query is given, fuse it with a sparse search.

        Args:
            query_dense: Dense query vector.
            query_sparse: Sparse query vector; when empty/falsy only the dense
                results are returned.
            top_k: Maximum number of results.
            filters: Optional Milvus boolean expression applied to both searches.

        Returns:
            Up to ``top_k`` results ordered by score, or ``[]`` on failure or
            when no collection is bound.
        """
        if not self.collection:
            logger.warning("Collection未初始化")
            return []

        try:
            self.collection.load()

            # Two separate single-vector searches fused client-side.
            dense_results = self.dense_search(query_dense, top_k, filters)

            if query_sparse:
                sparse_results = self.sparse_search(query_sparse, top_k, filters)
                merged = self._merge_results(dense_results, sparse_results, top_k)
                logger.success(f"混合检索完成,返回{len(merged)}条结果")
                return merged

            return dense_results

        except Exception as e:
            logger.error(f"混合检索失败: {e}")
            return []

    def _merge_results(
        self,
        dense_results: List[SearchResult],
        sparse_results: List[SearchResult],
        top_k: int,
        dense_weight: float = 0.6
    ) -> List[SearchResult]:
        """Fuse dense and sparse hits by a weighted sum of their scores.

        A hit found by only one search contributes 0 for the other one. When a
        hit appears in both lists, content and metadata come from the dense
        hit. Scores are combined as returned, without normalisation.
        NOTE(review): COSINE and IP scores may be on different scales -- confirm
        the weighting is intended.

        Returns:
            The ``top_k`` highest-scoring results, best first.
        """
        sparse_weight = 1 - dense_weight
        merged_dict = {}

        for r in dense_results:
            merged_dict[r.id] = {
                "result": r,
                "dense_score": r.score * dense_weight,
                "sparse_score": 0
            }

        for r in sparse_results:
            if r.id in merged_dict:
                merged_dict[r.id]["sparse_score"] = r.score * sparse_weight
            else:
                merged_dict[r.id] = {
                    "result": r,
                    "dense_score": 0,
                    "sparse_score": r.score * sparse_weight
                }

        final_results = [
            SearchResult(
                id=entry["result"].id,
                content=entry["result"].content,
                score=entry["dense_score"] + entry["sparse_score"],
                metadata=entry["result"].metadata
            )
            for entry in merged_dict.values()
        ]

        final_results.sort(key=lambda x: x.score, reverse=True)
        return final_results[:top_k]

    def _hits_to_results(self, results) -> List[SearchResult]:
        """Flatten the per-query hit lists returned by ``Collection.search`` into SearchResult objects."""
        return [
            SearchResult(
                id=hit.id,
                content=hit.entity.get("content", ""),
                score=hit.score,
                metadata={
                    name: hit.entity.get(name, default)
                    for name, default in self._METADATA_DEFAULTS.items()
                }
            )
            for hits in results
            for hit in hits
        ]

    def _search(
        self,
        query: Any,
        anns_field: str,
        search_params: Dict[str, Any],
        top_k: int,
        filters: Optional[str]
    ) -> List[SearchResult]:
        """Load the collection, run one single-vector search and convert the hits. Errors propagate."""
        self.collection.load()
        results = self.collection.search(
            data=[query],
            anns_field=anns_field,
            param=search_params,
            limit=top_k,
            # The ORM Collection.search names its filter parameter ``expr``.
            expr=filters,
            output_fields=list(self.OUTPUT_FIELDS)
        )
        return self._hits_to_results(results)

    def dense_search(
        self,
        query_dense: List[float],
        top_k: int = 10,
        filters: Optional[str] = None
    ) -> List[SearchResult]:
        """Search ``dense_vector`` with the COSINE metric (nprobe=16).

        Args:
            query_dense: Dense query vector.
            top_k: Maximum number of hits.
            filters: Optional Milvus boolean expression restricting the search.

        Returns:
            Matching results, or ``[]`` when no collection is bound or the
            search fails.
        """
        if not self.collection:
            return []

        try:
            search_params = {
                "metric_type": "COSINE",
                "params": {"nprobe": 16}
            }
            return self._search(query_dense, "dense_vector", search_params, top_k, filters)

        except Exception as e:
            logger.error(f"Dense检索失败: {e}")
            return []

    def sparse_search(
        self,
        query_sparse: Dict[int, float],
        top_k: int = 10,
        filters: Optional[str] = None
    ) -> List[SearchResult]:
        """Search ``sparse_vector`` with the IP metric (drop_ratio_search=0.2).

        Args:
            query_sparse: Sparse query vector as an index -> weight mapping.
            top_k: Maximum number of hits.
            filters: Optional Milvus boolean expression restricting the search.

        Returns:
            Matching results, or ``[]`` when no collection is bound or the
            search fails.
        """
        if not self.collection:
            return []

        try:
            search_params = {
                "metric_type": "IP",
                "params": {"drop_ratio_search": 0.2}
            }
            return self._search(query_sparse, "sparse_vector", search_params, top_k, filters)

        except Exception as e:
            logger.error(f"Sparse检索失败: {e}")
            return []

    def delete_by_doc_id(self, doc_id: str) -> int:
        """Delete every row whose ``doc_id`` equals the given value.

        Returns:
            The delete count reported by Milvus, or 0 when no collection is
            bound or the delete fails.
        """
        if not self.collection:
            return 0

        try:
            # Escape backslashes and double quotes so the value cannot end the
            # string literal and alter the expression.
            safe_doc_id = str(doc_id).replace("\\", "\\\\").replace('"', '\\"')
            expr = f'doc_id=="{safe_doc_id}"'
            result = self.collection.delete(expr)
            # delete_count is the count Milvus reports for the mutation;
            # primary_keys is not a reliable count for expression deletes.
            deleted = result.delete_count
            logger.info(f"删除记录: doc_id={doc_id}, 数量={deleted}")
            return deleted
        except Exception as e:
            logger.error(f"删除失败: {e}")
            return 0

    def get_collection_stats(self) -> Dict[str, Any]:
        """Return name, entity count and description of the bound collection.

        Returns ``{}`` when no collection is bound or the lookup fails.
        """
        if not self.collection:
            return {}

        try:
            return {
                "name": self.collection_name,
                "num_entities": self.collection.num_entities,
                "description": self.collection.description,
            }
        except Exception as e:
            logger.warning(f"获取统计信息失败: {e}")
            return {}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def create_milvus_client() -> MilvusClient:
    """Build a MilvusClient from settings, connect it and bind its collection.

    The return values of ``connect()`` and ``create_collection()`` are not
    checked, so the client is returned even when either step failed; callers
    can inspect ``client.connected`` and ``client.collection``.
    """
    client = MilvusClient()
    client.connect()
    # Reuse an existing collection rather than dropping it.
    client.create_collection(recreate=False)
    return client
|
|||
|
|
|
|||
|
|
|
|||
|
|
def insert_documents(
    client: MilvusClient,
    chunks: List[TextChunk],
    embeddings: EmbeddingResult
) -> List[int]:
    """Insert chunks through ``client.insert_chunks`` and return its result."""
    return client.insert_chunks(chunks, embeddings)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def search_regulations(
    client: MilvusClient,
    query_dense: List[float],
    query_sparse: Dict[int, float],
    top_k: int = 10,
    filters: Optional[str] = None
) -> List[SearchResult]:
    """Search regulations through ``client.hybrid_search`` and return its result.

    Args:
        client: Client whose ``hybrid_search`` is called.
        query_dense: Dense query vector.
        query_sparse: Sparse query vector.
        top_k: Maximum number of results.
        filters: Optional filter expression forwarded to ``hybrid_search``;
            ``None`` (the default) keeps the previous unfiltered behaviour.
    """
    return client.hybrid_search(query_dense, query_sparse, top_k, filters)
|