update
This commit is contained in:
485
backend/app/services/storage/milvus_client.py
Normal file
485
backend/app/services/storage/milvus_client.py
Normal file
@@ -0,0 +1,485 @@
|
||||
# app/services/storage/milvus_client.py
|
||||
"""Milvus向量数据库客户端 - 存储与检索服务"""
|
||||
|
||||
from pymilvus import (
|
||||
connections,
|
||||
Collection,
|
||||
FieldSchema,
|
||||
CollectionSchema,
|
||||
DataType,
|
||||
utility
|
||||
)
|
||||
from typing import List, Dict, Optional, Any
|
||||
from dataclasses import dataclass, field
|
||||
from loguru import logger
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
from ..embedding.text_chunker import TextChunk
|
||||
from ..embedding.bge_m3_embedder import EmbeddingResult
|
||||
from app.config.settings import settings
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """A single retrieval hit.

    Attributes:
        id: Primary key of the matched entity.
        content: Text stored in the entity's ``content`` field.
        score: Similarity score reported for the hit (or a fused score
            when the result comes out of a merge step).
        metadata: Extra scalar fields attached to the hit; defaults to a
            fresh empty dict per instance.
    """

    id: int
    content: str
    score: float
    # default_factory gives every instance its own dict (no shared state).
    metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class MilvusDocument:
    """Record layout of one document chunk as stored in Milvus.

    The field names mirror the non-primary-key columns of the collection
    schema declared on ``MilvusClient.SCHEMA_FIELDS``.
    """

    # --- identity ---------------------------------------------------------
    doc_id: str
    chunk_id: str

    # --- payload ----------------------------------------------------------
    content: str
    dense_vector: List[float]
    sparse_vector: Dict[int, float]

    # --- descriptive metadata ---------------------------------------------
    doc_name: str
    section_title: str
    clause_number: str
    page_number: int
    regulation_type: str
    version: str

    # --- bookkeeping ------------------------------------------------------
    create_time: int
|
||||
|
||||
|
||||
class MilvusClient:
    """Client for the Milvus collection that stores regulation chunks.

    Covers connection handling, collection and index creation, insertion of
    chunk embeddings, dense / sparse / hybrid retrieval, and deletion.

    Most methods log failures through loguru and return an empty or falsy
    value instead of raising. ``load_collection`` and ``release_collection``
    are the exception: they let pymilvus errors propagate.
    """

    COLLECTION_NAME = "regulations"

    SCHEMA_FIELDS = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="chunk_id", dtype=DataType.VARCHAR, max_length=128),
        FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=8192),
        FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
        FieldSchema(name="doc_name", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="section_title", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="clause_number", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="page_number", dtype=DataType.INT64),
        FieldSchema(name="regulation_type", dtype=DataType.VARCHAR, max_length=32),
        FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=32),
        FieldSchema(name="create_time", dtype=DataType.INT64),
    ]

    # Scalar fields requested with every search hit (shared by the dense and
    # sparse search paths so they cannot drift apart).
    _OUTPUT_FIELDS = (
        "doc_id", "chunk_id", "content",
        "doc_name", "section_title", "clause_number",
        "page_number", "regulation_type", "version",
    )

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        collection_name: Optional[str] = None,
        db_name: Optional[str] = None,
    ):
        """Store connection settings; no network I/O happens here.

        Args:
            host: Milvus host; falls back to ``settings.milvus_host`` when falsy.
            port: Milvus port; falls back to ``settings.milvus_port`` when falsy.
            collection_name: Collection to use; falls back to
                ``settings.milvus_collection`` when falsy.
            db_name: Database name; falls back to ``settings.milvus_db_name``
                when falsy.
        """
        self.host = host or settings.milvus_host
        self.port = port or settings.milvus_port
        self.collection_name = collection_name or settings.milvus_collection
        self.db_name = db_name or settings.milvus_db_name

        self.collection: Optional[Collection] = None
        self.connected = False

        logger.info(f"Milvus客户端配置: {self.host}:{self.port}, Collection: {self.collection_name}")

    def connect(self) -> bool:
        """Open the pymilvus connection registered under the ``default`` alias.

        Returns:
            True when the connection was established, False otherwise (the
            error is logged and ``self.connected`` is reset).
        """
        try:
            connections.connect(
                alias="default",
                host=self.host,
                port=self.port,
                db_name=self.db_name,
            )
            self.connected = True
            logger.success(f"Milvus连接成功: {self.host}:{self.port}")
            return True
        except Exception as e:
            logger.error(f"Milvus连接失败: {e}")
            self.connected = False
            return False

    def disconnect(self):
        """Close the ``default`` connection; errors are logged, not raised."""
        try:
            connections.disconnect("default")
            self.connected = False
            logger.info("Milvus连接已断开")
        except Exception as e:
            logger.warning(f"断开连接时出错: {e}")

    def create_collection(self, recreate: bool = False) -> bool:
        """Create the collection, or bind to it when it already exists.

        Args:
            recreate: When True and the collection exists, drop it first and
                build a new one from ``SCHEMA_FIELDS``.

        Returns:
            True when ``self.collection`` is bound to a usable collection,
            False when not connected or when pymilvus raised.
        """
        if not self.connected:
            logger.warning("未连接到Milvus,请先调用connect()")
            return False

        try:
            if utility.has_collection(self.collection_name):
                if not recreate:
                    logger.info(f"Collection已存在: {self.collection_name}")
                    self.collection = Collection(self.collection_name)
                    return True

                logger.info(f"删除已存在的Collection: {self.collection_name}")
                utility.drop_collection(self.collection_name)

            schema = CollectionSchema(
                fields=self.SCHEMA_FIELDS,
                description="法规文档向量存储",
                enable_dynamic_field=True,
            )
            self.collection = Collection(
                name=self.collection_name,
                schema=schema,
            )
            self._create_indexes()

            logger.success(f"Collection创建成功: {self.collection_name}")
            return True

        except Exception as e:
            logger.error(f"Collection创建失败: {e}")
            return False

    def _create_indexes(self):
        """Create the dense (IVF_FLAT/COSINE) and sparse (inverted/IP) indexes.

        Failures are logged as warnings and not propagated.
        """
        if not self.collection:
            return

        try:
            dense_index_params = {
                "metric_type": "COSINE",
                "index_type": "IVF_FLAT",
                "params": {"nlist": 128},
            }
            self.collection.create_index(
                field_name="dense_vector",
                index_params=dense_index_params,
            )

            sparse_index_params = {
                "metric_type": "IP",
                "index_type": "SPARSE_INVERTED_INDEX",
                "params": {"drop_ratio_build": 0.2},
            }
            self.collection.create_index(
                field_name="sparse_vector",
                index_params=sparse_index_params,
            )

            logger.success("向量索引创建成功")

        except Exception as e:
            logger.warning(f"创建索引时出错: {e}")

    def load_collection(self):
        """Load the bound collection into memory (no-op when unbound)."""
        if self.collection:
            self.collection.load()
            logger.info(f"Collection已加载: {self.collection_name}")

    def release_collection(self):
        """Release the bound collection from memory (no-op when unbound)."""
        if self.collection:
            self.collection.release()
            logger.info(f"Collection已释放: {self.collection_name}")

    def insert_chunks(
        self,
        chunks: List[TextChunk],
        embeddings: EmbeddingResult,
    ) -> List[int]:
        """Insert text chunks together with their dense and sparse embeddings.

        Args:
            chunks: Chunks to store; ``content`` and the ``metadata``
                attributes read below are written to the matching columns.
            embeddings: Embedding result whose ``texts``,
                ``dense_embeddings`` and ``sparse_embeddings`` line up
                one-to-one with ``chunks``.

        Returns:
            Primary keys assigned by Milvus, or ``[]`` when the collection is
            unbound, the inputs are empty or misaligned, or the insert failed.
        """
        if not self.collection:
            logger.warning("Collection未初始化")
            return []

        if len(chunks) != len(embeddings.texts):
            logger.warning("Chunks数量与嵌入数量不匹配")
            return []

        if not chunks:
            # Nothing to write; avoid sending an empty insert request.
            return []

        logger.info(f"准备插入{len(chunks)}个文档分块")

        try:
            current_time = int(time.time())
            rows = []

            for chunk, dense_emb, sparse_emb in zip(
                chunks,
                embeddings.dense_embeddings,
                embeddings.sparse_embeddings,
            ):
                meta = chunk.metadata
                # Accept numpy rows as well as plain Python sequences.
                dense_vector = (
                    dense_emb.tolist() if hasattr(dense_emb, "tolist") else list(dense_emb)
                )
                rows.append({
                    "doc_id": meta.doc_id,
                    "chunk_id": meta.chunk_id,
                    "content": chunk.content,
                    "dense_vector": dense_vector,
                    "sparse_vector": sparse_emb,
                    "doc_name": meta.doc_name,
                    "section_title": meta.section_title,
                    "clause_number": meta.clause_number,
                    "page_number": meta.page_number,
                    "regulation_type": meta.regulation_type,
                    "version": meta.version,
                    "create_time": current_time,
                })

            # zip() stops at the shortest input, so a short dense/sparse list
            # would otherwise silently drop trailing chunks.
            if len(rows) != len(chunks):
                logger.warning("Chunks数量与嵌入数量不匹配")
                return []

            result = self.collection.insert(rows)
            self.collection.flush()

            logger.success(f"插入完成,共{len(result.primary_keys)}条记录")
            return result.primary_keys

        except Exception as e:
            logger.error(f"插入数据失败: {e}")
            return []

    def hybrid_search(
        self,
        query_dense: List[float],
        query_sparse: Dict[int, float],
        top_k: int = 10,
        filters: Optional[str] = None,
    ) -> List[SearchResult]:
        """Hybrid retrieval: dense search, optionally fused with sparse search.

        Args:
            query_dense: Dense query vector.
            query_sparse: Sparse query vector; when empty/falsy only the
                dense results are returned.
            top_k: Maximum number of results to return.
            filters: Optional Milvus boolean expression applied to both
                searches.

        Returns:
            Up to ``top_k`` results, or ``[]`` when the collection is unbound
            or an error occurred.
        """
        if not self.collection:
            logger.warning("Collection未初始化")
            return []

        try:
            # Load up front so a load failure is reported as a hybrid-search
            # failure rather than as two separate per-field failures.
            self.collection.load()

            # Plain dense search first (per the original author's note, this
            # path is compatible with all versions).
            dense_results = self.dense_search(query_dense, top_k, filters)

            # Optionally fuse in the sparse results.
            if query_sparse:
                sparse_results = self.sparse_search(query_sparse, top_k, filters)
                merged = self._merge_results(dense_results, sparse_results, top_k)
                logger.success(f"混合检索完成,返回{len(merged)}条结果")
                return merged

            return dense_results

        except Exception as e:
            logger.error(f"混合检索失败: {e}")
            return []

    def _merge_results(
        self,
        dense_results: List[SearchResult],
        sparse_results: List[SearchResult],
        top_k: int,
        dense_weight: float = 0.6,
    ) -> List[SearchResult]:
        """Fuse dense and sparse hits by a weighted sum of their raw scores.

        A hit found by only one search contributes 0 for the other score.

        NOTE(review): the dense scores come from a COSINE search and the
        sparse scores from an IP search, and they are combined without
        normalisation — confirm the two score ranges are comparable enough
        for the chosen weights.

        Args:
            dense_results: Hits from the dense search.
            sparse_results: Hits from the sparse search.
            top_k: Maximum number of fused results to return.
            dense_weight: Weight of the dense score; the sparse score gets
                ``1 - dense_weight``.

        Returns:
            New ``SearchResult`` objects sorted by fused score, descending,
            truncated to ``top_k``.
        """
        sparse_weight = 1 - dense_weight
        merged_dict = {}

        for r in dense_results:
            merged_dict[r.id] = {
                "result": r,
                "dense_score": r.score * dense_weight,
                "sparse_score": 0,
            }

        for r in sparse_results:
            entry = merged_dict.get(r.id)
            if entry is not None:
                entry["sparse_score"] = r.score * sparse_weight
            else:
                merged_dict[r.id] = {
                    "result": r,
                    "dense_score": 0,
                    "sparse_score": r.score * sparse_weight,
                }

        final_results = [
            SearchResult(
                id=entry["result"].id,
                content=entry["result"].content,
                score=entry["dense_score"] + entry["sparse_score"],
                metadata=entry["result"].metadata,
            )
            for entry in merged_dict.values()
        ]

        final_results.sort(key=lambda x: x.score, reverse=True)
        return final_results[:top_k]

    @staticmethod
    def _hit_to_result(hit) -> SearchResult:
        """Convert one pymilvus search hit into a ``SearchResult``."""
        entity = hit.entity
        return SearchResult(
            id=hit.id,
            content=entity.get("content", ""),
            score=hit.score,
            metadata={
                "doc_id": entity.get("doc_id", ""),
                "chunk_id": entity.get("chunk_id", ""),
                "doc_name": entity.get("doc_name", ""),
                "section_title": entity.get("section_title", ""),
                "clause_number": entity.get("clause_number", ""),
                "page_number": entity.get("page_number", 0),
                "regulation_type": entity.get("regulation_type", ""),
                "version": entity.get("version", ""),
            },
        )

    def _vector_search(
        self,
        query: Any,
        anns_field: str,
        search_params: Dict[str, Any],
        top_k: int,
        filters: Optional[str],
    ) -> List[SearchResult]:
        """Run a single-vector search on ``anns_field``; raises on failure.

        The boolean filter is passed as ``expr``, which is the name the ORM
        ``Collection.search`` uses for it; a ``filter=`` keyword is not a
        declared parameter there.
        """
        self.collection.load()
        results = self.collection.search(
            data=[query],
            anns_field=anns_field,
            param=search_params,
            limit=top_k,
            expr=filters,
            output_fields=list(self._OUTPUT_FIELDS),
        )
        return [self._hit_to_result(hit) for hits in results for hit in hits]

    def dense_search(
        self,
        query_dense: List[float],
        top_k: int = 10,
        filters: Optional[str] = None,
    ) -> List[SearchResult]:
        """Dense-vector-only search (COSINE metric, ``nprobe`` 16).

        Args:
            query_dense: Dense query vector.
            top_k: Maximum number of hits.
            filters: Optional Milvus boolean expression.

        Returns:
            Matching results, or ``[]`` when the collection is unbound or
            the search failed (the error is logged).
        """
        if not self.collection:
            return []

        try:
            return self._vector_search(
                query_dense,
                "dense_vector",
                {"metric_type": "COSINE", "params": {"nprobe": 16}},
                top_k,
                filters,
            )
        except Exception as e:
            logger.error(f"Dense检索失败: {e}")
            return []

    def sparse_search(
        self,
        query_sparse: Dict[int, float],
        top_k: int = 10,
        filters: Optional[str] = None,
    ) -> List[SearchResult]:
        """Sparse-vector-only search (IP metric, ``drop_ratio_search`` 0.2).

        Args:
            query_sparse: Sparse query vector.
            top_k: Maximum number of hits.
            filters: Optional Milvus boolean expression.

        Returns:
            Matching results, or ``[]`` when the collection is unbound or
            the search failed (the error is logged).
        """
        if not self.collection:
            return []

        try:
            return self._vector_search(
                query_sparse,
                "sparse_vector",
                {"metric_type": "IP", "params": {"drop_ratio_search": 0.2}},
                top_k,
                filters,
            )
        except Exception as e:
            logger.error(f"Sparse检索失败: {e}")
            return []

    def delete_by_doc_id(self, doc_id: str) -> int:
        """Delete every entity whose ``doc_id`` equals the given value.

        Args:
            doc_id: Document identifier to match.

        Returns:
            The delete count reported by Milvus, or 0 when the collection is
            unbound or the delete failed (the error is logged).
        """
        if not self.collection:
            return 0

        try:
            # Escape backslashes and quotes so the value cannot terminate the
            # string literal and alter the delete expression.
            escaped = doc_id.replace("\\", "\\\\").replace('"', '\\"')
            expr = f'doc_id=="{escaped}"'
            result = self.collection.delete(expr)
            # Use the server-reported count; the primary-key list is not
            # expected to be populated for expression-based deletes.
            deleted = result.delete_count
            logger.info(f"删除记录: doc_id={doc_id}, 数量={deleted}")
            return deleted
        except Exception as e:
            logger.error(f"删除失败: {e}")
            return 0

    def get_collection_stats(self) -> Dict[str, Any]:
        """Return name, entity count and description of the bound collection.

        Returns:
            A dict with ``name``, ``num_entities`` and ``description``, or
            ``{}`` when the collection is unbound or the lookup failed.
        """
        if not self.collection:
            return {}

        try:
            return {
                "name": self.collection_name,
                "num_entities": self.collection.num_entities,
                "description": self.collection.description,
            }
        except Exception as e:
            logger.warning(f"获取统计信息失败: {e}")
            return {}
|
||||
|
||||
|
||||
def create_milvus_client() -> MilvusClient:
    """Convenience factory: build a client, connect, and bind its collection.

    The client is constructed with no arguments, then ``connect()`` and
    ``create_collection(recreate=False)`` are called in that order. Their
    return values are not inspected here, so the client is returned even
    when either step reported failure.
    """
    milvus = MilvusClient()
    milvus.connect()
    milvus.create_collection(recreate=False)
    return milvus
|
||||
|
||||
|
||||
def insert_documents(
    client: MilvusClient,
    chunks: List[TextChunk],
    embeddings: EmbeddingResult,
) -> List[int]:
    """Convenience wrapper: insert chunks and their embeddings via ``client``.

    Delegates to ``client.insert_chunks`` and returns whatever it returns.
    """
    inserted_ids = client.insert_chunks(chunks, embeddings)
    return inserted_ids
|
||||
|
||||
|
||||
def search_regulations(
    client: MilvusClient,
    query_dense: List[float],
    query_sparse: Dict[int, float],
    top_k: int = 10,
) -> List[SearchResult]:
    """Convenience wrapper: run a hybrid search for regulations via ``client``.

    Delegates to ``client.hybrid_search`` with the dense vector, the sparse
    vector and ``top_k``; no filter expression is passed.
    """
    hits = client.hybrid_search(query_dense, query_sparse, top_k)
    return hits
|
||||
Reference in New Issue
Block a user