Change Flask to FastAPI
rag/utils/__init__.py (new file, 130 lines)
@@ -0,0 +1,130 @@
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import os
import re

import tiktoken

from api.utils.file_utils import get_project_base_directory


def singleton(cls, *args, **kw):
    instances = {}

    def _singleton():
        key = str(cls) + str(os.getpid())
        if key not in instances:
            instances[key] = cls(*args, **kw)
        return instances[key]

    return _singleton
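# Usage sketch for the decorator above (`Config` is a hypothetical class; the
# cache key includes os.getpid(), so each worker process builds its own instance):
#
#     @singleton
#     class Config:
#         def __init__(self):
#             print("constructed once per process")
#
#     a = Config()   # the first call constructs the instance
#     b = Config()   # later calls return the cached one
#     assert a is b
#
# Note that the inner _singleton() takes no arguments, so constructor arguments
# can only be bound up front, e.g. client = singleton(SomeClient, host), where
# SomeClient and host are placeholders.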
def rmSpace(txt):
    txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)


def findMaxDt(fnm):
    m = "1970-01-01 00:00:00"
    try:
        with open(fnm, "r") as f:
            while True:
                line = f.readline()
                if not line:
                    break
                line = line.strip("\n")
                if line == 'nan':
                    continue
                if line > m:
                    m = line
    except Exception:
        pass
    return m


def findMaxTm(fnm):
    m = 0
    try:
        with open(fnm, "r") as f:
            while True:
                line = f.readline()
                if not line:
                    break
                line = line.strip("\n")
                if line == 'nan':
                    continue
                if int(line) > m:
                    m = int(line)
    except Exception:
        pass
    return m


tiktoken_cache_dir = get_project_base_directory()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
encoder = tiktoken.get_encoding("cl100k_base")


def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    try:
        return len(encoder.encode(string))
    except Exception:
        return 0

def total_token_count_from_response(resp):
    if hasattr(resp, "usage") and hasattr(resp.usage, "total_tokens"):
        try:
            return resp.usage.total_tokens
        except Exception:
            pass

    if hasattr(resp, "usage_metadata") and hasattr(resp.usage_metadata, "total_tokens"):
        try:
            return resp.usage_metadata.total_tokens
        except Exception:
            pass

    if 'usage' in resp and 'total_tokens' in resp['usage']:
        try:
            return resp["usage"]["total_tokens"]
        except Exception:
            pass
    return 0


def truncate(string: str, max_len: int) -> str:
    """Returns the text truncated to at most max_len tokens."""
    return encoder.decode(encoder.encode(string)[:max_len])
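# Both helpers measure length in cl100k_base tokens, not characters. A quick
# illustration (counts depend on the tiktoken vocabulary, so treat them as
# indicative):
#
#     s = "RAGFlow converts documents into searchable chunks."
#     n = num_tokens_from_string(s)   # token count under cl100k_base
#     short = truncate(s, 4)          # keeps only the first 4 tokens
#     assert num_tokens_from_string(short) <= 4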
def clean_markdown_block(text):
    text = re.sub(r'^\s*```markdown\s*\n?', '', text)
    text = re.sub(r'\n?\s*```\s*$', '', text)
    return text.strip()


def get_float(v):
    if v is None:
        return float('-inf')
    try:
        return float(v)
    except Exception:
        return float('-inf')
rag/utils/azure_sas_conn.py (new file, 95 lines)
@@ -0,0 +1,95 @@
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import os
import time
from io import BytesIO
from rag import settings
from rag.utils import singleton
from azure.storage.blob import ContainerClient


@singleton
class RAGFlowAzureSasBlob:
    def __init__(self):
        self.conn = None
        self.container_url = os.getenv('CONTAINER_URL', settings.AZURE["container_url"])
        self.sas_token = os.getenv('SAS_TOKEN', settings.AZURE["sas_token"])
        self.__open__()

    def __open__(self):
        try:
            if self.conn:
                self.__close__()
        except Exception:
            pass

        try:
            self.conn = ContainerClient.from_container_url(self.container_url + "?" + self.sas_token)
        except Exception:
            logging.exception("Fail to connect %s " % self.container_url)

    def __close__(self):
        del self.conn
        self.conn = None

    def health(self):
        _bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
        return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))

    def put(self, bucket, fnm, binary):
        for _ in range(3):
            try:
                return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
            except Exception:
                logging.exception(f"Fail put {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)

    def rm(self, bucket, fnm):
        try:
            self.conn.delete_blob(fnm)
        except Exception:
            logging.exception(f"Fail rm {bucket}/{fnm}")

    def get(self, bucket, fnm):
        for _ in range(1):
            try:
                r = self.conn.download_blob(fnm)
                return r.read()
            except Exception:
                logging.exception(f"fail get {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)
        return

    def obj_exist(self, bucket, fnm):
        try:
            return self.conn.get_blob_client(fnm).exists()
        except Exception:
            logging.exception(f"Fail to check existence of {bucket}/{fnm}")
        return False

    def get_presigned_url(self, bucket, fnm, expires):
        for _ in range(10):
            try:
                return self.conn.get_presigned_url("GET", bucket, fnm, expires)
            except Exception:
                logging.exception(f"fail get {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)
        return
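# A sketch of how this connector is typically driven (names are hypothetical;
# the `bucket` argument only appears in log messages, since every blob lives in
# the single SAS-scoped container):
#
#     store = RAGFlowAzureSasBlob()              # @singleton: one instance per process
#     store.put("kb", "doc.pdf", b"...bytes...")
#     data = store.get("kb", "doc.pdf")          # None once the retries are exhausted
#     if store.obj_exist("kb", "doc.pdf"):
#         store.rm("kb", "doc.pdf")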
rag/utils/azure_spn_conn.py (new file, 105 lines)
@@ -0,0 +1,105 @@
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import os
import time
from rag import settings
from rag.utils import singleton
from azure.identity import ClientSecretCredential, AzureAuthorityHosts
from azure.storage.filedatalake import FileSystemClient


@singleton
class RAGFlowAzureSpnBlob:
    def __init__(self):
        self.conn = None
        self.account_url = os.getenv('ACCOUNT_URL', settings.AZURE["account_url"])
        self.client_id = os.getenv('CLIENT_ID', settings.AZURE["client_id"])
        self.secret = os.getenv('SECRET', settings.AZURE["secret"])
        self.tenant_id = os.getenv('TENANT_ID', settings.AZURE["tenant_id"])
        self.container_name = os.getenv('CONTAINER_NAME', settings.AZURE["container_name"])
        self.__open__()

    def __open__(self):
        try:
            if self.conn:
                self.__close__()
        except Exception:
            pass

        try:
            # Note: the authority is pinned to the Azure China cloud here.
            credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA)
            self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials)
        except Exception:
            logging.exception("Fail to connect %s" % self.account_url)

    def __close__(self):
        del self.conn
        self.conn = None

    def health(self):
        _bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
        f = self.conn.create_file(fnm)
        f.append_data(binary, offset=0, length=len(binary))
        return f.flush_data(len(binary))

    def put(self, bucket, fnm, binary):
        for _ in range(3):
            try:
                f = self.conn.create_file(fnm)
                f.append_data(binary, offset=0, length=len(binary))
                return f.flush_data(len(binary))
            except Exception:
                logging.exception(f"Fail put {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)

    def rm(self, bucket, fnm):
        try:
            self.conn.delete_file(fnm)
        except Exception:
            logging.exception(f"Fail rm {bucket}/{fnm}")

    def get(self, bucket, fnm):
        for _ in range(1):
            try:
                client = self.conn.get_file_client(fnm)
                r = client.download_file()
                return r.read()
            except Exception:
                logging.exception(f"fail get {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)
        return

    def obj_exist(self, bucket, fnm):
        try:
            client = self.conn.get_file_client(fnm)
            return client.exists()
        except Exception:
            logging.exception(f"Fail to check existence of {bucket}/{fnm}")
        return False

    def get_presigned_url(self, bucket, fnm, expires):
        for _ in range(10):
            try:
                return self.conn.get_presigned_url("GET", bucket, fnm, expires)
            except Exception:
                logging.exception(f"fail get {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)
        return
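# Configuration sketch for the service-principal connector (all values are
# placeholders, including the endpoint host; each os.getenv() above falls back
# to the matching settings.AZURE entry when the variable is unset):
#
#     import os
#
#     os.environ.update({
#         "ACCOUNT_URL": "https://<account>.dfs.core.chinacloudapi.cn",  # Data Lake endpoint
#         "CLIENT_ID": "<app-registration-client-id>",
#         "SECRET": "<client-secret>",
#         "TENANT_ID": "<tenant-id>",
#         "CONTAINER_NAME": "ragflow",
#     })
#     blob = RAGFlowAzureSpnBlob()  # authenticates via ClientSecretCredential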
rag/utils/doc_store_conn.py (new file, 271 lines)
@@ -0,0 +1,271 @@
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from abc import ABC, abstractmethod
from dataclasses import dataclass
import numpy as np

DEFAULT_MATCH_VECTOR_TOPN = 10
DEFAULT_MATCH_SPARSE_TOPN = 10
VEC = list | np.ndarray


@dataclass
class SparseVector:
    indices: list[int]
    values: list[float] | list[int] | None = None

    def __post_init__(self):
        assert (self.values is None) or (len(self.indices) == len(self.values))

    def to_dict_old(self):
        d = {"indices": self.indices}
        if self.values is not None:
            d["values"] = self.values
        return d

    def to_dict(self):
        if self.values is None:
            raise ValueError("SparseVector.values is None")
        result = {}
        for i, v in zip(self.indices, self.values):
            result[str(i)] = v
        return result

    @staticmethod
    def from_dict(d):
        return SparseVector(d["indices"], d.get("values"))

    def __str__(self):
        return f"SparseVector(indices={self.indices}{'' if self.values is None else f', values={self.values}'})"

    def __repr__(self):
        return str(self)
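# For example, a three-term sparse embedding round-trips like this:
#
#     v = SparseVector(indices=[2, 7, 9], values=[0.5, 1.0, 0.25])
#     v.to_dict()   # {'2': 0.5, '7': 1.0, '9': 0.25} (keyed by stringified index)
#     SparseVector.from_dict({"indices": [2, 7], "values": [0.5, 1.0]})  # parses the to_dict_old() shape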
class MatchTextExpr(ABC):
    def __init__(
            self,
            fields: list[str],
            matching_text: str,
            topn: int,
            extra_options: dict | None = None,
    ):
        self.fields = fields
        self.matching_text = matching_text
        self.topn = topn
        # Give each instance its own dict: a `dict()` default would be shared
        # across instances and is mutated downstream (extra_options.update(...)).
        self.extra_options = extra_options if extra_options is not None else {}


class MatchDenseExpr(ABC):
    def __init__(
            self,
            vector_column_name: str,
            embedding_data: VEC,
            embedding_data_type: str,
            distance_type: str,
            topn: int = DEFAULT_MATCH_VECTOR_TOPN,
            extra_options: dict | None = None,
    ):
        self.vector_column_name = vector_column_name
        self.embedding_data = embedding_data
        self.embedding_data_type = embedding_data_type
        self.distance_type = distance_type
        self.topn = topn
        self.extra_options = extra_options if extra_options is not None else {}


class MatchSparseExpr(ABC):
    def __init__(
            self,
            vector_column_name: str,
            sparse_data: SparseVector | dict,
            distance_type: str,
            topn: int,
            opt_params: dict | None = None,
    ):
        self.vector_column_name = vector_column_name
        self.sparse_data = sparse_data
        self.distance_type = distance_type
        self.topn = topn
        self.opt_params = opt_params


class MatchTensorExpr(ABC):
    def __init__(
            self,
            column_name: str,
            query_data: VEC,
            query_data_type: str,
            topn: int,
            extra_option: dict | None = None,
    ):
        self.column_name = column_name
        self.query_data = query_data
        self.query_data_type = query_data_type
        self.topn = topn
        self.extra_option = extra_option


class FusionExpr(ABC):
    def __init__(self, method: str, topn: int, fusion_params: dict | None = None):
        self.method = method
        self.topn = topn
        self.fusion_params = fusion_params


MatchExpr = MatchTextExpr | MatchDenseExpr | MatchSparseExpr | MatchTensorExpr | FusionExpr
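# A typical hybrid retrieval request pairs a full-text clause with a dense
# clause and a fusion step (field names and the 768-dim vector are illustrative):
#
#     match_exprs = [
#         MatchTextExpr(["title_tks", "content_ltks"], "vector database", topn=64),
#         MatchDenseExpr("q_768_vec", [0.1] * 768, "float", "cosine", topn=64),
#         FusionExpr("weighted_sum", topn=64, fusion_params={"weights": "0.05,0.95"}),
#     ]
#
# The connectors below consume exactly this shape: the Elasticsearch backend
# reads the second weight as the vector share, and the Infinity backend passes
# the fusion parameters through to its own fusion operator.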
class OrderByExpr(ABC):
    def __init__(self):
        # (field, 0) sorts ascending; (field, 1) sorts descending.
        # Consumers read the `fields` attribute directly.
        self.fields = list()

    def asc(self, field: str):
        self.fields.append((field, 0))
        return self

    def desc(self, field: str):
        self.fields.append((field, 1))
        return self

class DocStoreConnection(ABC):
    """
    Database operations
    """

    @abstractmethod
    def dbType(self) -> str:
        """
        Return the type of the database.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def health(self) -> dict:
        """
        Return the health status of the database.
        """
        raise NotImplementedError("Not implemented")

    """
    Table operations
    """

    @abstractmethod
    def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
        """
        Create an index with the given name.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def deleteIdx(self, indexName: str, knowledgebaseId: str):
        """
        Delete the index with the given name.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
        """
        Check whether an index with the given name exists.
        """
        raise NotImplementedError("Not implemented")

    """
    CRUD operations
    """

    @abstractmethod
    def search(
            self, selectFields: list[str],
            highlightFields: list[str],
            condition: dict,
            matchExprs: list[MatchExpr],
            orderBy: OrderByExpr,
            offset: int,
            limit: int,
            indexNames: str | list[str],
            knowledgebaseIds: list[str],
            aggFields: list[str] = [],
            rank_feature: dict | None = None
    ):
        """
        Search with the given conjunctive equivalent filtering condition and return all fields of matched documents.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
        """
        Get a single chunk with the given id.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def insert(self, rows: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
        """
        Update or insert a bulk of rows.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def update(self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str) -> bool:
        """
        Update rows with the given conjunctive equivalent filtering condition.
        """
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
        """
        Delete rows with the given conjunctive equivalent filtering condition.
        """
        raise NotImplementedError("Not implemented")

    """
    Helper functions for search result
    """

    @abstractmethod
    def getTotal(self, res):
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def getChunkIds(self, res):
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def getFields(self, res, fields: list[str]) -> dict[str, dict]:
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def getHighlight(self, res, keywords: list[str], fieldnm: str):
        raise NotImplementedError("Not implemented")

    @abstractmethod
    def getAggregation(self, res, fieldnm: str):
        raise NotImplementedError("Not implemented")

    """
    SQL
    """

    @abstractmethod
    def sql(self, sql: str, fetch_size: int, format: str):
        """
        Run the SQL generated by text-to-sql.
        """
        raise NotImplementedError("Not implemented")
rag/utils/es_conn.py (new file, 631 lines)
@@ -0,0 +1,631 @@
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import re
import json
import time
import os

import copy
from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag import settings
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import singleton, get_float
from api.utils.file_utils import get_project_base_directory
from api.utils.common import convert_bytes
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
    FusionExpr
from rag.nlp import is_english, rag_tokenizer

ATTEMPT_TIME = 2

logger = logging.getLogger('ragflow.es_conn')


@singleton
class ESConnection(DocStoreConnection):
    def __init__(self):
        self.info = {}
        logger.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
        for _ in range(ATTEMPT_TIME):
            try:
                if self._connect():
                    break
            except Exception as e:
                logger.warning(f"{str(e)}. Waiting for Elasticsearch {settings.ES['hosts']} to become healthy.")
                time.sleep(5)

        if not self.es.ping():
            msg = f"Elasticsearch {settings.ES['hosts']} is unhealthy in 120s."
            logger.error(msg)
            raise Exception(msg)
        v = self.info.get("version", {"number": "8.11.3"})
        v = v["number"].split(".")[0]
        if int(v) < 8:
            msg = f"Elasticsearch version must be greater than or equal to 8, current version: {v}"
            logger.error(msg)
            raise Exception(msg)
        fp_mapping = os.path.join(get_project_base_directory(), "conf", "mapping.json")
        if not os.path.exists(fp_mapping):
            msg = f"Elasticsearch mapping file not found at {fp_mapping}"
            logger.error(msg)
            raise Exception(msg)
        self.mapping = json.load(open(fp_mapping, "r"))
        logger.info(f"Elasticsearch {settings.ES['hosts']} is healthy.")

    def _connect(self):
        self.es = Elasticsearch(
            settings.ES["hosts"].split(","),
            basic_auth=(settings.ES["username"], settings.ES[
                "password"]) if "username" in settings.ES and "password" in settings.ES else None,
            verify_certs=False,
            timeout=600
        )
        if self.es:
            self.info = self.es.info()
            return True
        return False
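    # The expected shape of settings.ES, inferred from the accesses above
    # (values are placeholders):
    #
    #     ES = {
    #         "hosts": "http://es01:9200,http://es02:9200",  # comma-separated; split in _connect()
    #         "username": "elastic",    # optional: basic auth is sent only when both
    #         "password": "changeme",   # username and password are present
    #     }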
    """
    Database operations
    """

    def dbType(self) -> str:
        return "elasticsearch"

    def health(self) -> dict:
        health_dict = dict(self.es.cluster.health())
        health_dict["type"] = "elasticsearch"
        return health_dict

    """
    Table operations
    """

    def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
        if self.indexExist(indexName, knowledgebaseId):
            return True
        try:
            from elasticsearch.client import IndicesClient
            return IndicesClient(self.es).create(index=indexName,
                                                 settings=self.mapping["settings"],
                                                 mappings=self.mapping["mappings"])
        except Exception:
            logger.exception("ESConnection.createIdx error %s" % (indexName))

    def deleteIdx(self, indexName: str, knowledgebaseId: str):
        if len(knowledgebaseId) > 0:
            # The index needs to stay alive after any kb deletion, since all kbs under a tenant share one index.
            return
        try:
            self.es.indices.delete(index=indexName, allow_no_indices=True)
        except NotFoundError:
            pass
        except Exception:
            logger.exception("ESConnection.deleteIdx error %s" % (indexName))

    def indexExist(self, indexName: str, knowledgebaseId: str = None) -> bool:
        s = Index(indexName, self.es)
        for i in range(ATTEMPT_TIME):
            try:
                return s.exists()
            except ConnectionTimeout:
                logger.exception("ES request timeout")
                time.sleep(3)
                self._connect()
                continue
            except Exception as e:
                logger.exception(e)
                break
        return False

    """
    CRUD operations
    """

    def search(
            self, selectFields: list[str],
            highlightFields: list[str],
            condition: dict,
            matchExprs: list[MatchExpr],
            orderBy: OrderByExpr,
            offset: int,
            limit: int,
            indexNames: str | list[str],
            knowledgebaseIds: list[str],
            aggFields: list[str] = [],
            rank_feature: dict | None = None
    ):
        """
        Refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
        """
        if isinstance(indexNames, str):
            indexNames = indexNames.split(",")
        assert isinstance(indexNames, list) and len(indexNames) > 0
        assert "_id" not in condition

        bqry = Q("bool", must=[])
        condition["kb_id"] = knowledgebaseIds
        for k, v in condition.items():
            if k == "available_int":
                if v == 0:
                    bqry.filter.append(Q("range", available_int={"lt": 1}))
                else:
                    bqry.filter.append(
                        Q("bool", must_not=Q("range", available_int={"lt": 1})))
                continue
            if not v:
                continue
            if isinstance(v, list):
                bqry.filter.append(Q("terms", **{k: v}))
            elif isinstance(v, str) or isinstance(v, int):
                bqry.filter.append(Q("term", **{k: v}))
            else:
                raise Exception(
                    f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")

        s = Search()
        vector_similarity_weight = 0.5
        for m in matchExprs:
            if isinstance(m, FusionExpr) and m.method == "weighted_sum" and "weights" in m.fusion_params:
                assert len(matchExprs) == 3 and isinstance(matchExprs[0], MatchTextExpr) and isinstance(matchExprs[1],
                                                                                                        MatchDenseExpr) and isinstance(
                    matchExprs[2], FusionExpr)
                weights = m.fusion_params["weights"]
                vector_similarity_weight = get_float(weights.split(",")[1])
        for m in matchExprs:
            if isinstance(m, MatchTextExpr):
                minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
                if isinstance(minimum_should_match, float):
                    minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                bqry.must.append(Q("query_string", fields=m.fields,
                                   type="best_fields", query=m.matching_text,
                                   minimum_should_match=minimum_should_match,
                                   boost=1))
                bqry.boost = 1.0 - vector_similarity_weight

            elif isinstance(m, MatchDenseExpr):
                assert (bqry is not None)
                similarity = 0.0
                if "similarity" in m.extra_options:
                    similarity = m.extra_options["similarity"]
                s = s.knn(m.vector_column_name,
                          m.topn,
                          m.topn * 2,
                          query_vector=list(m.embedding_data),
                          filter=bqry.to_dict(),
                          similarity=similarity,
                          )

        if bqry and rank_feature:
            for fld, sc in rank_feature.items():
                if fld != PAGERANK_FLD:
                    fld = f"{TAG_FLD}.{fld}"
                bqry.should.append(Q("rank_feature", field=fld, linear={}, boost=sc))

        if bqry:
            s = s.query(bqry)
        for field in highlightFields:
            s = s.highlight(field)

        if orderBy:
            orders = list()
            for field, order in orderBy.fields:
                order = "asc" if order == 0 else "desc"
                if field in ["page_num_int", "top_int"]:
                    order_info = {"order": order, "unmapped_type": "float",
                                  "mode": "avg", "numeric_type": "double"}
                elif field.endswith("_int") or field.endswith("_flt"):
                    order_info = {"order": order, "unmapped_type": "float"}
                else:
                    order_info = {"order": order, "unmapped_type": "text"}
                orders.append({field: order_info})
            s = s.sort(*orders)

        for fld in aggFields:
            s.aggs.bucket(f'aggs_{fld}', 'terms', field=fld, size=1000000)

        if limit > 0:
            s = s[offset:offset + limit]
        q = s.to_dict()
        logger.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))

        for i in range(ATTEMPT_TIME):
            try:
                # print(json.dumps(q, ensure_ascii=False))
                res = self.es.search(index=indexNames,
                                     body=q,
                                     timeout="600s",
                                     # search_type="dfs_query_then_fetch",
                                     track_total_hits=True,
                                     _source=True)
                if str(res.get("timed_out", "")).lower() == "true":
                    raise Exception("Es Timeout.")
                logger.debug(f"ESConnection.search {str(indexNames)} res: " + str(res))
                return res
            except ConnectionTimeout:
                logger.exception("ES request timeout")
                self._connect()
                continue
            except Exception as e:
                logger.exception(f"ESConnection.search {str(indexNames)} query: " + str(q) + str(e))
                raise e

        logger.error(f"ESConnection.search timeout for {ATTEMPT_TIME} times!")
        raise Exception("ESConnection.search timeout.")
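    # To make the weighted_sum fusion above concrete: only the second field of
    # the weights string is read back, and the text clause's boost is derived
    # from it (values illustrative):
    #
    #     weights = "0.05,0.95"                                        # "text_weight,vector_weight"
    #     vector_similarity_weight = get_float(weights.split(",")[1])  # 0.95
    #     text_boost = 1.0 - vector_similarity_weight                  # 0.05, applied as bqry.boost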
    def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
        for i in range(ATTEMPT_TIME):
            try:
                res = self.es.get(index=(indexName),
                                  id=chunkId, source=True, )
                if str(res.get("timed_out", "")).lower() == "true":
                    raise Exception("Es Timeout.")
                chunk = res["_source"]
                chunk["id"] = chunkId
                return chunk
            except NotFoundError:
                return None
            except Exception as e:
                logger.exception(f"ESConnection.get({chunkId}) got exception")
                raise e
        logger.error(f"ESConnection.get timeout for {ATTEMPT_TIME} times!")
        raise Exception("ESConnection.get timeout.")

    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
        # Refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
        operations = []
        for d in documents:
            assert "_id" not in d
            assert "id" in d
            d_copy = copy.deepcopy(d)
            d_copy["kb_id"] = knowledgebaseId
            meta_id = d_copy.pop("id", "")
            operations.append(
                {"index": {"_index": indexName, "_id": meta_id}})
            operations.append(d_copy)

        res = []
        for _ in range(ATTEMPT_TIME):
            try:
                res = []
                r = self.es.bulk(index=(indexName), operations=operations,
                                 refresh=False, timeout="60s")
                # r["errors"] is False when every bulk action succeeded.
                if re.search(r"False", str(r["errors"]), re.IGNORECASE):
                    return res

                for item in r["items"]:
                    for action in ["create", "delete", "index", "update"]:
                        if action in item and "error" in item[action]:
                            res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
                return res
            except ConnectionTimeout:
                logger.exception("ES request timeout")
                time.sleep(3)
                self._connect()
                continue
            except Exception as e:
                res.append(str(e))
                logger.warning("ESConnection.insert got exception: " + str(e))

        return res

    def update(self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str) -> bool:
        doc = copy.deepcopy(newValue)
        doc.pop("id", None)
        condition["kb_id"] = knowledgebaseId
        if "id" in condition and isinstance(condition["id"], str):
            # update a specific single document
            chunkId = condition["id"]
            for i in range(ATTEMPT_TIME):
                for k in doc.keys():
                    if "feas" != k.split("_")[-1]:
                        continue
                    try:
                        self.es.update(index=indexName, id=chunkId, script=f"ctx._source.remove(\"{k}\");")
                    except Exception:
                        logger.exception(f"ESConnection.update(index={indexName}, id={chunkId}, doc={json.dumps(condition, ensure_ascii=False)}) got exception")
                try:
                    self.es.update(index=indexName, id=chunkId, doc=doc)
                    return True
                except Exception as e:
                    logger.exception(
                        f"ESConnection.update(index={indexName}, id={chunkId}, doc={json.dumps(condition, ensure_ascii=False)}) got exception: " + str(e))
                    break
            return False

        # update an unspecified, possibly multiple, set of documents
        bqry = Q("bool")
        for k, v in condition.items():
            if not isinstance(k, str) or not v:
                continue
            if k == "exists":
                bqry.filter.append(Q("exists", field=v))
                continue
            if isinstance(v, list):
                bqry.filter.append(Q("terms", **{k: v}))
            elif isinstance(v, str) or isinstance(v, int):
                bqry.filter.append(Q("term", **{k: v}))
            else:
                raise Exception(
                    f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
        scripts = []
        params = {}
        for k, v in newValue.items():
            if k == "remove":
                if isinstance(v, str):
                    scripts.append(f"ctx._source.remove('{v}');")
                if isinstance(v, dict):
                    for kk, vv in v.items():
                        scripts.append(f"int i=ctx._source.{kk}.indexOf(params.p_{kk});ctx._source.{kk}.remove(i);")
                        params[f"p_{kk}"] = vv
                continue
            if k == "add":
                if isinstance(v, dict):
                    for kk, vv in v.items():
                        scripts.append(f"ctx._source.{kk}.add(params.pp_{kk});")
                        params[f"pp_{kk}"] = vv.strip()
                continue
            if (not isinstance(k, str) or not v) and k != "available_int":
                continue
            if isinstance(v, str):
                v = re.sub(r"(['\n\r]|\\.)", " ", v)
                params[f"pp_{k}"] = v
                scripts.append(f"ctx._source.{k}=params.pp_{k};")
            elif isinstance(v, int) or isinstance(v, float):
                scripts.append(f"ctx._source.{k}={v};")
            elif isinstance(v, list):
                scripts.append(f"ctx._source.{k}=params.pp_{k};")
                params[f"pp_{k}"] = json.dumps(v, ensure_ascii=False)
            else:
                raise Exception(
                    f"newValue `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str.")
        ubq = UpdateByQuery(
            index=indexName).using(
            self.es).query(bqry)
        ubq = ubq.script(source="".join(scripts), params=params)
        ubq = ubq.params(refresh=True)
        ubq = ubq.params(slices=5)
        ubq = ubq.params(conflicts="proceed")

        for _ in range(ATTEMPT_TIME):
            try:
                _ = ubq.execute()
                return True
            except ConnectionTimeout:
                logger.exception("ES request timeout")
                time.sleep(3)
                self._connect()
                continue
            except Exception as e:
                logger.error("ESConnection.update got exception: " + str(e) + "\n".join(scripts))
                break
        return False
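    # As an illustration of the condition-based branch above, a newValue like
    # this compiles into a single Painless script (values hypothetical):
    #
    #     newValue = {"remove": {"tag_kwd": "draft"}, "add": {"tag_kwd": "final"}, "available_int": 1}
    #
    # yields, roughly:
    #
    #     int i=ctx._source.tag_kwd.indexOf(params.p_tag_kwd);ctx._source.tag_kwd.remove(i);
    #     ctx._source.tag_kwd.add(params.pp_tag_kwd);
    #     ctx._source.available_int=1;
    #
    # with params {"p_tag_kwd": "draft", "pp_tag_kwd": "final"}.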
    def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
        qry = None
        assert "_id" not in condition
        condition["kb_id"] = knowledgebaseId
        if "id" in condition:
            chunk_ids = condition["id"]
            if not isinstance(chunk_ids, list):
                chunk_ids = [chunk_ids]
            if not chunk_ids:  # when chunk_ids is empty, delete all
                qry = Q("match_all")
            else:
                qry = Q("ids", values=chunk_ids)
        else:
            qry = Q("bool")
            for k, v in condition.items():
                if k == "exists":
                    qry.filter.append(Q("exists", field=v))

                elif k == "must_not":
                    if isinstance(v, dict):
                        for kk, vv in v.items():
                            if kk == "exists":
                                qry.must_not.append(Q("exists", field=vv))

                elif isinstance(v, list):
                    qry.must.append(Q("terms", **{k: v}))
                elif isinstance(v, str) or isinstance(v, int):
                    qry.must.append(Q("term", **{k: v}))
                else:
                    raise Exception("Condition value must be int, str or list.")
        logger.debug("ESConnection.delete query: " + json.dumps(qry.to_dict()))
        for _ in range(ATTEMPT_TIME):
            try:
                res = self.es.delete_by_query(
                    index=indexName,
                    body=Search().query(qry).to_dict(),
                    refresh=True)
                return res["deleted"]
            except ConnectionTimeout:
                logger.exception("ES request timeout")
                time.sleep(3)
                self._connect()
                continue
            except Exception as e:
                logger.warning("ESConnection.delete got exception: " + str(e))
                if re.search(r"(not_found)", str(e), re.IGNORECASE):
                    return 0
        return 0

    """
    Helper functions for search result
    """

    def getTotal(self, res):
        if isinstance(res["hits"]["total"], dict):
            return res["hits"]["total"]["value"]
        return res["hits"]["total"]

    def getChunkIds(self, res):
        return [d["_id"] for d in res["hits"]["hits"]]

    def __getSource(self, res):
        rr = []
        for d in res["hits"]["hits"]:
            d["_source"]["id"] = d["_id"]
            d["_source"]["_score"] = d["_score"]
            rr.append(d["_source"])
        return rr

    def getFields(self, res, fields: list[str]) -> dict[str, dict]:
        res_fields = {}
        if not fields:
            return {}
        for d in self.__getSource(res):
            m = {n: d.get(n) for n in fields if d.get(n) is not None}
            for n, v in m.items():
                if isinstance(v, list):
                    m[n] = v
                    continue
                if n == "available_int" and isinstance(v, (int, float)):
                    m[n] = v
                    continue
                if not isinstance(v, str):
                    m[n] = str(m[n])
                # if n.find("tks") > 0:
                #     m[n] = rmSpace(m[n])

            if m:
                res_fields[d["id"]] = m
        return res_fields

    def getHighlight(self, res, keywords: list[str], fieldnm: str):
        ans = {}
        for d in res["hits"]["hits"]:
            hlts = d.get("highlight")
            if not hlts:
                continue
            txt = "...".join([a for a in list(hlts.items())[0][1]])
            if not is_english(txt.split()):
                ans[d["_id"]] = txt
                continue

            txt = d["_source"][fieldnm]
            txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
            txts = []
            for t in re.split(r"[.?!;\n]", txt):
                for w in keywords:
                    t = re.sub(r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])" % re.escape(w), r"\1<em>\2</em>\3", t,
                               flags=re.IGNORECASE | re.MULTILINE)
                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE):
                    continue
                txts.append(t)
            ans[d["_id"]] = "...".join(txts) if txts else "...".join([a for a in list(hlts.items())[0][1]])

        return ans

    def getAggregation(self, res, fieldnm: str):
        agg_field = "aggs_" + fieldnm
        if "aggregations" not in res or agg_field not in res["aggregations"]:
            return list()
        bkts = res["aggregations"][agg_field]["buckets"]
        return [(b["key"], b["doc_count"]) for b in bkts]

    """
    SQL
    """

    def sql(self, sql: str, fetch_size: int, format: str):
        logger.debug(f"ESConnection.sql get sql: {sql}")
        sql = re.sub(r"[ `]+", " ", sql)
        sql = sql.replace("%", "")
        replaces = []
        for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
            fld, v = r.group(1), r.group(3)
            match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
                fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
            replaces.append(
                ("{}{}'{}'".format(
                    r.group(1),
                    r.group(2),
                    r.group(3)),
                 match))

        for p, r in replaces:
            sql = sql.replace(p, r, 1)
        logger.debug(f"ESConnection.sql to es: {sql}")

        for i in range(ATTEMPT_TIME):
            try:
                res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format,
                                        request_timeout="2s")
                return res
            except ConnectionTimeout:
                logger.exception("ES request timeout")
                time.sleep(3)
                self._connect()
                continue
            except Exception:
                logger.exception("ESConnection.sql got exception")
                break
        logger.error(f"ESConnection.sql timeout for {ATTEMPT_TIME} times!")
        return None
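    # For example, an equality test on a tokenized field in the incoming SQL is
    # rewritten into ES full-text syntax before submission (tokenizer output
    # abbreviated):
    #
    #     input:   ... WHERE content_ltks = 'deep learning'
    #     output:  ... WHERE  MATCH(content_ltks, '<tokenized terms>', 'operator=OR;minimum_should_match=30%')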
    def get_cluster_stats(self):
        """
        curl -XGET "http://{es_host}/_cluster/stats" -H "kbn-xsrf: reporting" to view the raw stats.
        """
        raw_stats = self.es.cluster.stats()
        logger.debug(f"ESConnection.get_cluster_stats: {raw_stats}")
        try:
            res = {
                'cluster_name': raw_stats['cluster_name'],
                'status': raw_stats['status']
            }
            indices_status = raw_stats['indices']
            res.update({
                'indices': indices_status['count'],
                'indices_shards': indices_status['shards']['total']
            })
            doc_info = indices_status['docs']
            res.update({
                'docs': doc_info['count'],
                'docs_deleted': doc_info['deleted']
            })
            store_info = indices_status['store']
            res.update({
                'store_size': convert_bytes(store_info['size_in_bytes']),
                'total_dataset_size': convert_bytes(store_info['total_data_set_size_in_bytes'])
            })
            mappings_info = indices_status['mappings']
            res.update({
                'mappings_fields': mappings_info['total_field_count'],
                'mappings_deduplicated_fields': mappings_info['total_deduplicated_field_count'],
                'mappings_deduplicated_size': convert_bytes(mappings_info['total_deduplicated_mapping_size_in_bytes'])
            })
            node_info = raw_stats['nodes']
            res.update({
                'nodes': node_info['count']['total'],
                'nodes_version': node_info['versions'],
                'os_mem': convert_bytes(node_info['os']['mem']['total_in_bytes']),
                'os_mem_used': convert_bytes(node_info['os']['mem']['used_in_bytes']),
                'os_mem_used_percent': node_info['os']['mem']['used_percent'],
                'jvm_versions': node_info['jvm']['versions'][0]['vm_version'],
                'jvm_heap_used': convert_bytes(node_info['jvm']['mem']['heap_used_in_bytes']),
                'jvm_heap_max': convert_bytes(node_info['jvm']['mem']['heap_max_in_bytes'])
            })
            return res

        except Exception as e:
            logger.exception(f"ESConnection.get_cluster_stats: {e}")
            return None
rag/utils/infinity_conn.py (new file, 784 lines; listing truncated below)
@@ -0,0 +1,784 @@
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import os
import re
import json
import time
import copy
import infinity
from infinity.common import ConflictType, InfinityException, SortType
from infinity.index import IndexInfo, IndexType
from infinity.connection_pool import ConnectionPool
from infinity.errors import ErrorCode
from rag import settings
from rag.settings import PAGERANK_FLD, TAG_FLD
from rag.utils import singleton
import pandas as pd
from api.utils.file_utils import get_project_base_directory
from rag.nlp import is_english

from rag.utils.doc_store_conn import (
    DocStoreConnection,
    MatchExpr,
    MatchTextExpr,
    MatchDenseExpr,
    FusionExpr,
    OrderByExpr,
)

logger = logging.getLogger("ragflow.infinity_conn")


def field_keyword(field_name: str):
    # The "docnm_kwd" field is always a string, not a list.
    if field_name == "source_id" or (field_name.endswith("_kwd") and field_name != "docnm_kwd" and field_name != "knowledge_graph_kwd"):
        return True
    return False


def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | None:
    assert "_id" not in condition
    clmns = {}
    if table_instance:
        for n, ty, de, _ in table_instance.show_columns().rows():
            clmns[n] = (ty, de)

    def exists(cln):
        nonlocal clmns
        assert cln in clmns, f"'{cln}' should be in '{clmns}'."
        ty, de = clmns[cln]
        if ty.lower().find("cha") >= 0:  # str.find returns -1 when absent, which is truthy
            if not de:
                de = ""
            return f" {cln}!='{de}' "
        return f"{cln}!={de}"

    cond = list()
    for k, v in condition.items():
        if not isinstance(k, str) or not v:
            continue
        if field_keyword(k):
            if isinstance(v, list):
                inCond = list()
                for item in v:
                    if isinstance(item, str):
                        item = item.replace("'", "''")
                    inCond.append(f"filter_fulltext('{k}', '{item}')")
                if inCond:
                    strInCond = " or ".join(inCond)
                    strInCond = f"({strInCond})"
                    cond.append(strInCond)
            else:
                cond.append(f"filter_fulltext('{k}', '{v}')")
        elif isinstance(v, list):
            inCond = list()
            for item in v:
                if isinstance(item, str):
                    item = item.replace("'", "''")
                    inCond.append(f"'{item}'")
                else:
                    inCond.append(str(item))
            if inCond:
                strInCond = ", ".join(inCond)
                strInCond = f"{k} IN ({strInCond})"
                cond.append(strInCond)
        elif k == "must_not":
            if isinstance(v, dict):
                for kk, vv in v.items():
                    if kk == "exists":
                        cond.append("NOT (%s)" % exists(vv))
        elif isinstance(v, str):
            cond.append(f"{k}='{v}'")
        elif k == "exists":
            cond.append(exists(v))
        else:
            cond.append(f"{k}={str(v)}")
    return " AND ".join(cond) if cond else "1=1"
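# For instance, a typical filter dict compiles to an Infinity filter string like
# this (values hypothetical; the `exists` branch additionally needs a
# table_instance to look up column types and defaults):
#
#     cond = {"doc_id": "abc123", "available_int": 1, "tag_kwd": ["draft", "old"]}
#     equivalent_condition_to_str(cond)
#     # => "doc_id='abc123' AND available_int=1 AND
#     #     (filter_fulltext('tag_kwd', 'draft') or filter_fulltext('tag_kwd', 'old'))"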
|
||||
|
||||
def concat_dataframes(df_list: list[pd.DataFrame], selectFields: list[str]) -> pd.DataFrame:
|
||||
df_list2 = [df for df in df_list if not df.empty]
|
||||
if df_list2:
|
||||
return pd.concat(df_list2, axis=0).reset_index(drop=True)
|
||||
|
||||
schema = []
|
||||
for field_name in selectFields:
|
||||
if field_name == "score()": # Workaround: fix schema is changed to score()
|
||||
schema.append("SCORE")
|
||||
elif field_name == "similarity()": # Workaround: fix schema is changed to similarity()
|
||||
schema.append("SIMILARITY")
|
||||
else:
|
||||
schema.append(field_name)
|
||||
return pd.DataFrame(columns=schema)
|
||||
|
||||
|
||||
@singleton
|
||||
class InfinityConnection(DocStoreConnection):
|
||||
def __init__(self):
|
||||
self.dbName = settings.INFINITY.get("db_name", "default_db")
|
||||
infinity_uri = settings.INFINITY["uri"]
|
||||
if ":" in infinity_uri:
|
||||
host, port = infinity_uri.split(":")
|
||||
infinity_uri = infinity.common.NetworkAddress(host, int(port))
|
||||
self.connPool = None
|
||||
logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
|
||||
for _ in range(24):
|
||||
try:
|
||||
connPool = ConnectionPool(infinity_uri, max_size=32)
|
||||
inf_conn = connPool.get_conn()
|
||||
res = inf_conn.show_current_node()
|
||||
if res.error_code == ErrorCode.OK and res.server_status in ["started", "alive"]:
|
||||
self._migrate_db(inf_conn)
|
||||
self.connPool = connPool
|
||||
connPool.release_conn(inf_conn)
|
||||
break
|
||||
connPool.release_conn(inf_conn)
|
||||
logger.warn(f"Infinity status: {res.server_status}. Waiting Infinity {infinity_uri} to be healthy.")
|
||||
time.sleep(5)
|
||||
except Exception as e:
|
||||
logger.warning(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
|
||||
time.sleep(5)
|
||||
if self.connPool is None:
|
||||
msg = f"Infinity {infinity_uri} is unhealthy in 120s."
|
||||
logger.error(msg)
|
||||
raise Exception(msg)
|
||||
logger.info(f"Infinity {infinity_uri} is healthy.")
|
||||
|
||||
def _migrate_db(self, inf_conn):
|
||||
inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", "infinity_mapping.json")
|
||||
if not os.path.exists(fp_mapping):
|
||||
raise Exception(f"Mapping file not found at {fp_mapping}")
|
||||
schema = json.load(open(fp_mapping))
|
||||
table_names = inf_db.list_tables().table_names
|
||||
for table_name in table_names:
|
||||
inf_table = inf_db.get_table(table_name)
|
||||
index_names = inf_table.list_indexes().index_names
|
||||
if "q_vec_idx" not in index_names:
|
||||
# Skip tables not created by me
|
||||
continue
|
||||
column_names = inf_table.show_columns()["name"]
|
||||
column_names = set(column_names)
|
||||
for field_name, field_info in schema.items():
|
||||
if field_name in column_names:
|
||||
continue
|
||||
res = inf_table.add_columns({field_name: field_info})
|
||||
assert res.error_code == infinity.ErrorCode.OK
|
||||
logger.info(f"INFINITY added following column to table {table_name}: {field_name} {field_info}")
|
||||
if field_info["type"] != "varchar" or "analyzer" not in field_info:
|
||||
continue
|
||||
inf_table.create_index(
|
||||
f"text_idx_{field_name}",
|
||||
IndexInfo(field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}),
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
|
||||
"""
|
||||
Database operations
|
||||
"""
|
||||
|
||||
def dbType(self) -> str:
|
||||
return "infinity"
|
||||
|
||||
def health(self) -> dict:
|
||||
"""
|
||||
Return the health status of the database.
|
||||
"""
|
||||
inf_conn = self.connPool.get_conn()
|
||||
res = inf_conn.show_current_node()
|
||||
self.connPool.release_conn(inf_conn)
|
||||
res2 = {
|
||||
"type": "infinity",
|
||||
"status": "green" if res.error_code == 0 and res.server_status in ["started", "alive"] else "red",
|
||||
"error": res.error_msg,
|
||||
}
|
||||
return res2
|
||||
|
||||
"""
|
||||
Table operations
|
||||
"""
|
||||
|
||||
def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
|
||||
table_name = f"{indexName}_{knowledgebaseId}"
|
||||
inf_conn = self.connPool.get_conn()
|
||||
inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
|
||||
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", "infinity_mapping.json")
|
||||
if not os.path.exists(fp_mapping):
|
||||
raise Exception(f"Mapping file not found at {fp_mapping}")
|
||||
schema = json.load(open(fp_mapping))
|
||||
vector_name = f"q_{vectorSize}_vec"
|
||||
schema[vector_name] = {"type": f"vector,{vectorSize},float"}
|
||||
inf_table = inf_db.create_table(
|
||||
table_name,
|
||||
schema,
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
inf_table.create_index(
|
||||
"q_vec_idx",
|
||||
IndexInfo(
|
||||
vector_name,
|
||||
IndexType.Hnsw,
|
||||
{
|
||||
"M": "16",
|
||||
"ef_construction": "50",
|
||||
"metric": "cosine",
|
||||
"encode": "lvq",
|
||||
},
|
||||
),
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
for field_name, field_info in schema.items():
|
||||
if field_info["type"] != "varchar" or "analyzer" not in field_info:
|
||||
continue
|
||||
inf_table.create_index(
|
||||
f"text_idx_{field_name}",
|
||||
IndexInfo(field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}),
|
||||
ConflictType.Ignore,
|
||||
)
|
||||
self.connPool.release_conn(inf_conn)
|
||||
logger.info(f"INFINITY created table {table_name}, vector size {vectorSize}")
|
||||
|
||||
def deleteIdx(self, indexName: str, knowledgebaseId: str):
|
||||
table_name = f"{indexName}_{knowledgebaseId}"
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
db_instance.drop_table(table_name, ConflictType.Ignore)
|
||||
self.connPool.release_conn(inf_conn)
|
||||
logger.info(f"INFINITY dropped table {table_name}")
|
||||
|
||||
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
|
||||
table_name = f"{indexName}_{knowledgebaseId}"
|
||||
try:
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
_ = db_instance.get_table(table_name)
|
||||
self.connPool.release_conn(inf_conn)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"INFINITY indexExist {str(e)}")
|
||||
return False
|
||||
|
||||
"""
|
||||
CRUD operations
|
||||
"""
|
||||
|
||||
def search(
|
||||
self,
|
||||
selectFields: list[str],
|
||||
highlightFields: list[str],
|
||||
condition: dict,
|
||||
matchExprs: list[MatchExpr],
|
||||
orderBy: OrderByExpr,
|
||||
offset: int,
|
||||
limit: int,
|
||||
indexNames: str | list[str],
|
||||
knowledgebaseIds: list[str],
|
||||
aggFields: list[str] = [],
|
||||
rank_feature: dict | None = None,
|
||||
) -> tuple[pd.DataFrame, int]:
|
||||
"""
|
||||
BUG: Infinity returns empty for a highlight field if the query string doesn't use that field.
|
||||
"""
|
||||
if isinstance(indexNames, str):
|
||||
indexNames = indexNames.split(",")
|
||||
assert isinstance(indexNames, list) and len(indexNames) > 0
|
||||
inf_conn = self.connPool.get_conn()
|
||||
db_instance = inf_conn.get_database(self.dbName)
|
||||
df_list = list()
|
||||
table_list = list()
|
||||
output = selectFields.copy()
|
||||
for essential_field in ["id"] + aggFields:
|
||||
if essential_field not in output:
|
||||
                output.append(essential_field)
        score_func = ""
        score_column = ""
        for matchExpr in matchExprs:
            if isinstance(matchExpr, MatchTextExpr):
                score_func = "score()"
                score_column = "SCORE"
                break
        if not score_func:
            for matchExpr in matchExprs:
                if isinstance(matchExpr, MatchDenseExpr):
                    score_func = "similarity()"
                    score_column = "SIMILARITY"
                    break
        if matchExprs:
            if score_func not in output:
                output.append(score_func)
            if PAGERANK_FLD not in output:
                output.append(PAGERANK_FLD)
        output = [f for f in output if f != "_score"]
        if limit <= 0:
            # ElasticSearch default limit is 10000
            limit = 10000

        # Prepare expressions common to all tables
        filter_cond = None
        filter_fulltext = ""
        if condition:
            table_found = False
            for indexName in indexNames:
                for kb_id in knowledgebaseIds:
                    table_name = f"{indexName}_{kb_id}"
                    try:
                        filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
                        table_found = True
                        break
                    except Exception:
                        pass
                if table_found:
                    break
            if not table_found:
                logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}")
                return pd.DataFrame(), 0

        for matchExpr in matchExprs:
            if isinstance(matchExpr, MatchTextExpr):
                if filter_cond and "filter" not in matchExpr.extra_options:
                    matchExpr.extra_options.update({"filter": filter_cond})
                fields = ",".join(matchExpr.fields)
                filter_fulltext = f"filter_fulltext('{fields}', '{matchExpr.matching_text}')"
                if filter_cond:
                    filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
                minimum_should_match = matchExpr.extra_options.get("minimum_should_match", 0.0)
                if isinstance(minimum_should_match, float):
                    str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                    matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match

                # Add rank_feature support
                if rank_feature and "rank_features" not in matchExpr.extra_options:
                    # Convert rank_feature dict to Infinity's rank_features string format
                    # Format: "field^feature_name^weight,field^feature_name^weight"
                    rank_features_list = []
                    for feature_name, weight in rank_feature.items():
                        # Use TAG_FLD as the field containing rank features
                        rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}")
                    if rank_features_list:
                        matchExpr.extra_options["rank_features"] = ",".join(rank_features_list)

                for k, v in matchExpr.extra_options.items():
                    if not isinstance(v, str):
                        matchExpr.extra_options[k] = str(v)
                logger.debug(f"INFINITY search MatchTextExpr: {json.dumps(matchExpr.__dict__)}")
            elif isinstance(matchExpr, MatchDenseExpr):
                if filter_fulltext and "filter" not in matchExpr.extra_options:
                    matchExpr.extra_options.update({"filter": filter_fulltext})
                for k, v in matchExpr.extra_options.items():
                    if not isinstance(v, str):
                        matchExpr.extra_options[k] = str(v)
                similarity = matchExpr.extra_options.get("similarity")
                if similarity:
                    matchExpr.extra_options["threshold"] = similarity
                    del matchExpr.extra_options["similarity"]
                logger.debug(f"INFINITY search MatchDenseExpr: {json.dumps(matchExpr.__dict__)}")
            elif isinstance(matchExpr, FusionExpr):
                logger.debug(f"INFINITY search FusionExpr: {json.dumps(matchExpr.__dict__)}")
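        # Illustration of the rank_features option assembled above, assuming
        # TAG_FLD == "tag_feas" and a hypothetical rank_feature input:
        #   rank_feature = {"sports": 0.8, "news": 0.2}
        #   => extra_options["rank_features"] == "tag_feas^sports^0.8,tag_feas^news^0.2"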
        order_by_expr_list = list()
        if orderBy.fields:
            for order_field in orderBy.fields:
                if order_field[1] == 0:
                    order_by_expr_list.append((order_field[0], SortType.Asc))
                else:
                    order_by_expr_list.append((order_field[0], SortType.Desc))

        total_hits_count = 0
        # Scatter search tables and gather the results
        for indexName in indexNames:
            for knowledgebaseId in knowledgebaseIds:
                table_name = f"{indexName}_{knowledgebaseId}"
                try:
                    table_instance = db_instance.get_table(table_name)
                except Exception:
                    continue
                table_list.append(table_name)
                builder = table_instance.output(output)
                if len(matchExprs) > 0:
                    for matchExpr in matchExprs:
                        if isinstance(matchExpr, MatchTextExpr):
                            fields = ",".join(matchExpr.fields)
                            builder = builder.match_text(
                                fields,
                                matchExpr.matching_text,
                                matchExpr.topn,
                                matchExpr.extra_options.copy(),
                            )
                        elif isinstance(matchExpr, MatchDenseExpr):
                            builder = builder.match_dense(
                                matchExpr.vector_column_name,
                                matchExpr.embedding_data,
                                matchExpr.embedding_data_type,
                                matchExpr.distance_type,
                                matchExpr.topn,
                                matchExpr.extra_options.copy(),
                            )
                        elif isinstance(matchExpr, FusionExpr):
                            builder = builder.fusion(matchExpr.method, matchExpr.topn, matchExpr.fusion_params)
                else:
                    if filter_cond and len(filter_cond) > 0:
                        builder.filter(filter_cond)
                    if orderBy.fields:
                        builder.sort(order_by_expr_list)
                builder.offset(offset).limit(limit)
                kb_res, extra_result = builder.option({"total_hits_count": True}).to_df()
                if extra_result:
                    total_hits_count += int(extra_result["total_hits_count"])
                logger.debug(f"INFINITY search table: {str(table_name)}, result: {str(kb_res)}")
                df_list.append(kb_res)
        self.connPool.release_conn(inf_conn)
        res = concat_dataframes(df_list, output)
        if matchExprs:
            res["Sum"] = res[score_column] + res[PAGERANK_FLD]
            res = res.sort_values(by="Sum", ascending=False).reset_index(drop=True).drop(columns=["Sum"])
        res = res.head(limit)
        logger.debug(f"INFINITY search final result: {str(res)}")
        return res, total_hits_count

    def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        df_list = list()
        assert isinstance(knowledgebaseIds, list)
        table_list = list()
        for knowledgebaseId in knowledgebaseIds:
            table_name = f"{indexName}_{knowledgebaseId}"
            table_list.append(table_name)
            table_instance = None
            try:
                table_instance = db_instance.get_table(table_name)
            except Exception:
                logger.warning(f"Table not found: {table_name}. This knowledge base isn't created in Infinity; it may have been created by another document engine.")
                continue
            kb_res, _ = table_instance.output(["*"]).filter(f"id = '{chunkId}'").to_df()
            logger.debug(f"INFINITY get table: {str(table_list)}, result: {str(kb_res)}")
            df_list.append(kb_res)
        self.connPool.release_conn(inf_conn)
        res = concat_dataframes(df_list, ["id"])
        res_fields = self.getFields(res, res.columns.tolist())
        return res_fields.get(chunkId, None)

    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        table_name = f"{indexName}_{knowledgebaseId}"
        try:
            table_instance = db_instance.get_table(table_name)
        except InfinityException as e:
            # src/common/status.cppm, kTableNotExist = 3022
            if e.error_code != ErrorCode.TABLE_NOT_EXIST:
                raise
            vector_size = 0
            patt = re.compile(r"q_(?P<vector_size>\d+)_vec")
            for k in documents[0].keys():
                m = patt.match(k)
                if m:
                    vector_size = int(m.group("vector_size"))
                    break
            if vector_size == 0:
                raise ValueError("Cannot infer vector size from documents")
            self.createIdx(indexName, knowledgebaseId, vector_size)
            table_instance = db_instance.get_table(table_name)

        # embedding fields can't have a default value....
        embedding_clmns = []
        clmns = table_instance.show_columns().rows()
        for n, ty, _, _ in clmns:
            r = re.search(r"Embedding\([a-z]+,([0-9]+)\)", ty)
            if not r:
                continue
            embedding_clmns.append((n, int(r.group(1))))

        docs = copy.deepcopy(documents)
        for d in docs:
            assert "_id" not in d
            assert "id" in d
            for k, v in d.items():
                if field_keyword(k):
                    if isinstance(v, list):
                        d[k] = "###".join(v)
                    else:
                        d[k] = v
                elif re.search(r"_feas$", k):
                    d[k] = json.dumps(v)
                elif k == "kb_id":
                    if isinstance(d[k], list):
                        d[k] = d[k][0]  # since d[k] is a list, but we need a str
                elif k == "position_int":
                    assert isinstance(v, list)
                    arr = [num for row in v for num in row]
                    d[k] = "_".join(f"{num:08x}" for num in arr)
                elif k in ["page_num_int", "top_int"]:
                    assert isinstance(v, list)
                    d[k] = "_".join(f"{num:08x}" for num in v)
                else:
                    d[k] = v

            for n, vs in embedding_clmns:
                if n in d:
                    continue
                d[n] = [0] * vs
        ids = ["'{}'".format(d["id"]) for d in docs]
        str_ids = ", ".join(ids)
        str_filter = f"id IN ({str_ids})"
        table_instance.delete(str_filter)
        # for doc in documents:
        #     logger.info(f"insert position_int: {doc['position_int']}")
        # logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
        table_instance.insert(docs)
        self.connPool.release_conn(inf_conn)
        logger.debug(f"INFINITY inserted into {table_name} {str_ids}.")
        return []
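    # Round-trip sketch of the position_int packing used in insert() above and
    # decoded again in getFields() below (toy values):
    #   rows = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
    #   packed = "_".join(f"{n:08x}" for row in rows for n in row)
    #   flat = [int(h, 16) for h in packed.split("_")]
    #   assert [flat[i:i + 5] for i in range(0, len(flat), 5)] == rows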
    def update(self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str) -> bool:
        # if 'position_int' in newValue:
        #     logger.info(f"update position_int: {newValue['position_int']}")
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        table_name = f"{indexName}_{knowledgebaseId}"
        table_instance = db_instance.get_table(table_name)
        # if "exists" in condition:
        #     del condition["exists"]

        clmns = {}
        if table_instance:
            for n, ty, de, _ in table_instance.show_columns().rows():
                clmns[n] = (ty, de)
        filter = equivalent_condition_to_str(condition, table_instance)
        removeValue = {}
        for k, v in list(newValue.items()):
            if field_keyword(k):
                if isinstance(v, list):
                    newValue[k] = "###".join(v)
                else:
                    newValue[k] = v
            elif re.search(r"_feas$", k):
                newValue[k] = json.dumps(v)
            elif k == "kb_id":
                if isinstance(newValue[k], list):
                    newValue[k] = newValue[k][0]  # since newValue[k] is a list, but we need a str
            elif k == "position_int":
                assert isinstance(v, list)
                arr = [num for row in v for num in row]
                newValue[k] = "_".join(f"{num:08x}" for num in arr)
            elif k in ["page_num_int", "top_int"]:
                assert isinstance(v, list)
                newValue[k] = "_".join(f"{num:08x}" for num in v)
            elif k == "remove":
                if isinstance(v, str):
                    assert v in clmns, f"'{v}' should be in '{clmns}'."
                    ty, de = clmns[v]
                    if ty.lower().find("cha") >= 0:
                        if not de:
                            de = ""
                        newValue[v] = de
                else:
                    for kk, vv in v.items():
                        removeValue[kk] = vv
                del newValue[k]
            else:
                newValue[k] = v

        remove_opt = {}  # "[k,new_value]": [id_to_update, ...]
        if removeValue:
            col_to_remove = list(removeValue.keys())
            row_to_opt = table_instance.output(col_to_remove + ["id"]).filter(filter).to_df()
            logger.debug(f"INFINITY search table {str(table_name)}, filter {filter}, result: {str(row_to_opt[0])}")
            row_to_opt = self.getFields(row_to_opt, col_to_remove)
            for id, old_v in row_to_opt.items():
                for k, remove_v in removeValue.items():
                    if remove_v in old_v[k]:
                        new_v = old_v[k].copy()
                        new_v.remove(remove_v)
                        kv_key = json.dumps([k, new_v])
                        if kv_key not in remove_opt:
                            remove_opt[kv_key] = [id]
                        else:
                            remove_opt[kv_key].append(id)

        logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
        for update_kv, ids in remove_opt.items():
            k, v = json.loads(update_kv)
            table_instance.update(filter + " AND id in ({0})".format(",".join([f"'{id}'" for id in ids])), {k: "###".join(v)})

        table_instance.update(filter, newValue)
        self.connPool.release_conn(inf_conn)
        return True
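    # Sketch of the remove_opt grouping built in update() above (hypothetical data):
    # removing "b" from two chunks whose tag_kwd is ["a", "b"] yields
    #   remove_opt == {'["tag_kwd", ["a"]]': ["chunk_1", "chunk_7"]}
    # so both rows are patched with a single update setting tag_kwd to "a".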
    def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
        inf_conn = self.connPool.get_conn()
        db_instance = inf_conn.get_database(self.dbName)
        table_name = f"{indexName}_{knowledgebaseId}"
        try:
            table_instance = db_instance.get_table(table_name)
        except Exception:
            logger.warning(f"Skipped deleting from table {table_name} since the table doesn't exist.")
            return 0
        filter = equivalent_condition_to_str(condition, table_instance)
        logger.debug(f"INFINITY delete table {table_name}, filter {filter}.")
        res = table_instance.delete(filter)
        self.connPool.release_conn(inf_conn)
        return res.deleted_rows

    """
    Helper functions for search result
    """

    def getTotal(self, res: tuple[pd.DataFrame, int] | pd.DataFrame) -> int:
        if isinstance(res, tuple):
            return res[1]
        return len(res)

    def getChunkIds(self, res: tuple[pd.DataFrame, int] | pd.DataFrame) -> list[str]:
        if isinstance(res, tuple):
            res = res[0]
        return list(res["id"])

    def getFields(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fields: list[str]) -> dict[str, dict]:
        if isinstance(res, tuple):
            res = res[0]
        if not fields:
            return {}
        fieldsAll = fields.copy()
        fieldsAll.append("id")
        column_map = {col.lower(): col for col in res.columns}
        matched_columns = {column_map[col.lower()]: col for col in set(fieldsAll) if col.lower() in column_map}
        none_columns = [col for col in set(fieldsAll) if col.lower() not in column_map]

        res2 = res[matched_columns.keys()]
        res2 = res2.rename(columns=matched_columns)
        res2.drop_duplicates(subset=["id"], inplace=True)

        for column in res2.columns:
            k = column.lower()
            if field_keyword(k):
                res2[column] = res2[column].apply(lambda v: [kwd for kwd in v.split("###") if kwd])
            elif re.search(r"_feas$", k):
                res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
            elif k == "position_int":

                def to_position_int(v):
                    if v:
                        arr = [int(hex_val, 16) for hex_val in v.split("_")]
                        v = [arr[i : i + 5] for i in range(0, len(arr), 5)]
                    else:
                        v = []
                    return v

                res2[column] = res2[column].apply(to_position_int)
            elif k in ["page_num_int", "top_int"]:
                res2[column] = res2[column].apply(lambda v: [int(hex_val, 16) for hex_val in v.split("_")] if v else [])
            else:
                pass
        for column in none_columns:
            res2[column] = None

        return res2.set_index("id").to_dict(orient="index")

    def getHighlight(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, keywords: list[str], fieldnm: str):
        if isinstance(res, tuple):
            res = res[0]
        ans = {}
        num_rows = len(res)
        column_id = res["id"]
        if fieldnm not in res:
            return {}
        for i in range(num_rows):
            id = column_id[i]
            txt = res[fieldnm][i]
            if re.search(r"<em>[^<>]+</em>", txt, flags=re.IGNORECASE | re.MULTILINE):
                ans[id] = txt
                continue
            txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
            txts = []
            for t in re.split(r"[.?!;\n]", txt):
                if is_english([t]):
                    for w in keywords:
                        t = re.sub(
                            r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])" % re.escape(w),
                            r"\1<em>\2</em>\3",
                            t,
                            flags=re.IGNORECASE | re.MULTILINE,
                        )
                else:
                    for w in sorted(keywords, key=len, reverse=True):
                        t = re.sub(
                            re.escape(w),
                            f"<em>{w}</em>",
                            t,
                            flags=re.IGNORECASE | re.MULTILINE,
                        )
                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE):
                    continue
                txts.append(t)
            if txts:
                ans[id] = "...".join(txts)
            else:
                ans[id] = txt
        return ans

    def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
        """
        Manual aggregation for tag fields since Infinity doesn't provide native aggregation
        """
        from collections import Counter

        # Extract DataFrame from result
        if isinstance(res, tuple):
            df, _ = res
        else:
            df = res

        if df.empty or fieldnm not in df.columns:
            return []

        # Aggregate tag counts
        tag_counter = Counter()

        for value in df[fieldnm]:
            if pd.isna(value) or not value:
                continue

            # Handle different tag formats
            if isinstance(value, str):
                # Split by ### for the tag_kwd field, or by comma for other formats
                if fieldnm == "tag_kwd" and "###" in value:
                    tags = [tag.strip() for tag in value.split("###") if tag.strip()]
                else:
                    # Try comma separation as a fallback
                    tags = [tag.strip() for tag in value.split(",") if tag.strip()]

                for tag in tags:
                    if tag:  # Only count non-empty tags
                        tag_counter[tag] += 1
            elif isinstance(value, list):
                # Handle list format
                for tag in value:
                    if tag and isinstance(tag, str):
                        tag_counter[tag.strip()] += 1

        # Return as a list of [tag, count] pairs, sorted by count descending
        return [[tag, count] for tag, count in tag_counter.most_common()]
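    # Toy example of the manual aggregation above (assumed inputs):
    #   df[fieldnm] == ["a###b", "b", None] with fieldnm == "tag_kwd"
    #   => [["b", 2], ["a", 1]]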
    """
    SQL
    """

    def sql(self, sql: str, fetch_size: int, format: str):
        raise NotImplementedError("Not implemented")
261
rag/utils/mcp_tool_call_conn.py
Normal file
@@ -0,0 +1,261 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import asyncio
import logging
import threading
import weakref
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FuturesTimeoutError
from string import Template
from typing import Any, Literal

from typing_extensions import override

from api.db import MCPServerType
from mcp.client.session import ClientSession
from mcp.client.sse import sse_client
from mcp.client.streamable_http import streamablehttp_client
from mcp.types import CallToolResult, ListToolsResult, TextContent, Tool
from rag.llm.chat_model import ToolCallSession

MCPTaskType = Literal["list_tools", "tool_call"]
MCPTask = tuple[MCPTaskType, dict[str, Any], asyncio.Queue[Any]]


class MCPToolCallSession(ToolCallSession):
    _ALL_INSTANCES: weakref.WeakSet["MCPToolCallSession"] = weakref.WeakSet()

    def __init__(self, mcp_server: Any, server_variables: dict[str, Any] | None = None) -> None:
        self.__class__._ALL_INSTANCES.add(self)

        self._mcp_server = mcp_server
        self._server_variables = server_variables or {}
        self._queue = asyncio.Queue()
        self._close = False

        self._event_loop = asyncio.new_event_loop()
        self._thread_pool = ThreadPoolExecutor(max_workers=1)
        self._thread_pool.submit(self._event_loop.run_forever)

        asyncio.run_coroutine_threadsafe(self._mcp_server_loop(), self._event_loop)

    async def _mcp_server_loop(self) -> None:
        url = self._mcp_server.url.strip()
        raw_headers: dict[str, str] = self._mcp_server.headers or {}
        headers: dict[str, str] = {}

        for h, v in raw_headers.items():
            nh = Template(h).safe_substitute(self._server_variables)
            nv = Template(v).safe_substitute(self._server_variables)
            headers[nh] = nv

        if self._mcp_server.server_type == MCPServerType.SSE:
            # SSE transport
            try:
                async with sse_client(url, headers) as stream:
                    async with ClientSession(*stream) as client_session:
                        try:
                            await asyncio.wait_for(client_session.initialize(), timeout=5)
                            logging.info("client_session initialized successfully")
                            await self._process_mcp_tasks(client_session)
                        except asyncio.TimeoutError:
                            msg = f"Timeout initializing client_session for server {self._mcp_server.id}"
                            logging.error(msg)
                            await self._process_mcp_tasks(None, msg)
            except Exception:
                msg = "Connection failed (possibly due to auth error). Please check authentication settings first"
                await self._process_mcp_tasks(None, msg)

        elif self._mcp_server.server_type == MCPServerType.STREAMABLE_HTTP:
            # Streamable HTTP transport
            try:
                async with streamablehttp_client(url, headers) as (read_stream, write_stream, _):
                    async with ClientSession(read_stream, write_stream) as client_session:
                        try:
                            await asyncio.wait_for(client_session.initialize(), timeout=5)
                            logging.info("client_session initialized successfully")
                            await self._process_mcp_tasks(client_session)
                        except asyncio.TimeoutError:
                            msg = f"Timeout initializing client_session for server {self._mcp_server.id}"
                            logging.error(msg)
                            await self._process_mcp_tasks(None, msg)
            except Exception as e:
                logging.exception(e)
                msg = "Connection failed (possibly due to auth error). Please check authentication settings first"
                await self._process_mcp_tasks(None, msg)

        else:
            await self._process_mcp_tasks(None, f"Unsupported MCP server type: {self._mcp_server.server_type}, id: {self._mcp_server.id}")

    async def _process_mcp_tasks(self, client_session: ClientSession | None, error_message: str | None = None) -> None:
        while not self._close:
            try:
                mcp_task, arguments, result_queue = await asyncio.wait_for(self._queue.get(), timeout=1)
            except asyncio.TimeoutError:
                continue

            logging.debug(f"Got MCP task {mcp_task} arguments {arguments}")

            r: Any = None

            if not client_session or error_message:
                r = ValueError(error_message)
                await result_queue.put(r)
                continue

            try:
                if mcp_task == "list_tools":
                    r = await client_session.list_tools()
                elif mcp_task == "tool_call":
                    r = await client_session.call_tool(**arguments)
                else:
                    r = ValueError(f"Unknown MCP task {mcp_task}")
            except Exception as e:
                r = e

            await result_queue.put(r)

    async def _call_mcp_server(self, task_type: MCPTaskType, timeout: float | int = 8, **kwargs) -> Any:
        results = asyncio.Queue()
        await self._queue.put((task_type, kwargs, results))

        try:
            result: CallToolResult | Exception = await asyncio.wait_for(results.get(), timeout=timeout)
            if isinstance(result, Exception):
                raise result
            return result
        except asyncio.TimeoutError:
            raise asyncio.TimeoutError(f"MCP task '{task_type}' timeout after {timeout}s")
        except Exception:
            raise

    async def _call_mcp_tool(self, name: str, arguments: dict[str, Any], timeout: float | int = 10) -> str:
        result: CallToolResult = await self._call_mcp_server("tool_call", name=name, arguments=arguments, timeout=timeout)

        if result.isError:
            return f"MCP server error: {result.content}"

        # For now, we only support text content
        if isinstance(result.content[0], TextContent):
            return result.content[0].text
        else:
            return f"Unsupported content type {type(result.content)}"

    async def _get_tools_from_mcp_server(self, timeout: float | int = 8) -> list[Tool]:
        try:
            result: ListToolsResult = await self._call_mcp_server("list_tools", timeout=timeout)
            return result.tools
        except Exception:
            raise

    def get_tools(self, timeout: float | int = 10) -> list[Tool]:
        future = asyncio.run_coroutine_threadsafe(self._get_tools_from_mcp_server(timeout=timeout), self._event_loop)
        try:
            return future.result(timeout=timeout)
        except FuturesTimeoutError:
            msg = f"Timeout when fetching tools from MCP server: {self._mcp_server.id} (timeout={timeout})"
            logging.error(msg)
            raise RuntimeError(msg)
        except Exception:
            logging.exception(f"Error fetching tools from MCP server: {self._mcp_server.id}")
            raise

    @override
    def tool_call(self, name: str, arguments: dict[str, Any], timeout: float | int = 10) -> str:
        future = asyncio.run_coroutine_threadsafe(self._call_mcp_tool(name, arguments), self._event_loop)
        try:
            return future.result(timeout=timeout)
        except FuturesTimeoutError:
            logging.error(f"Timeout calling tool '{name}' on MCP server: {self._mcp_server.id} (timeout={timeout})")
            return f"Timeout calling tool '{name}' (timeout={timeout})."
        except Exception as e:
            logging.exception(f"Error calling tool '{name}' on MCP server: {self._mcp_server.id}")
            return f"Error calling tool '{name}': {e}."

    async def close(self) -> None:
        if self._close:
            return

        self._close = True
        self._event_loop.call_soon_threadsafe(self._event_loop.stop)
        self._thread_pool.shutdown(wait=True)
        self.__class__._ALL_INSTANCES.discard(self)

    def close_sync(self, timeout: float | int = 5) -> None:
        if not self._event_loop.is_running():
            logging.warning(f"Event loop already stopped for {self._mcp_server.id}")
            return

        future = asyncio.run_coroutine_threadsafe(self.close(), self._event_loop)
        try:
            future.result(timeout=timeout)
        except FuturesTimeoutError:
            logging.error(f"Timeout while closing session for server {self._mcp_server.id} (timeout={timeout})")
        except Exception:
            logging.exception(f"Unexpected error during close_sync for {self._mcp_server.id}")
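# Minimal usage sketch (the server object and its fields are assumptions, not part
# of this module). A session owns a private event loop in a worker thread, so the
# synchronous wrappers below can be called from ordinary, non-async code:
#   session = MCPToolCallSession(mcp_server, {"api_key": "..."})
#   tools = session.get_tools(timeout=10)
#   answer = session.tool_call("search", {"query": "ragflow"}, timeout=10)
#   session.close_sync()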
def close_multiple_mcp_toolcall_sessions(sessions: list[MCPToolCallSession]) -> None:
    logging.info(f"Want to clean up {len(sessions)} MCP sessions")

    async def _gather_and_stop() -> None:
        try:
            await asyncio.gather(*[s.close() for s in sessions if s is not None], return_exceptions=True)
        finally:
            loop.call_soon_threadsafe(loop.stop)

    loop = asyncio.new_event_loop()
    thread = threading.Thread(target=loop.run_forever, daemon=True)
    thread.start()

    asyncio.run_coroutine_threadsafe(_gather_and_stop(), loop).result()

    thread.join()
    logging.info(f"{len(sessions)} MCP sessions have been cleaned up. {len(list(MCPToolCallSession._ALL_INSTANCES))} remain in the global context.")


def shutdown_all_mcp_sessions():
    """Gracefully shut down all active MCPToolCallSession instances."""
    sessions = list(MCPToolCallSession._ALL_INSTANCES)
    if not sessions:
        logging.info("No MCPToolCallSession instances to close.")
        return

    logging.info(f"Shutting down {len(sessions)} MCPToolCallSession instances...")
    close_multiple_mcp_toolcall_sessions(sessions)
    logging.info("All MCPToolCallSession instances have been closed.")


def mcp_tool_metadata_to_openai_tool(mcp_tool: Tool | dict) -> dict[str, Any]:
    if isinstance(mcp_tool, dict):
        return {
            "type": "function",
            "function": {
                "name": mcp_tool["name"],
                "description": mcp_tool["description"],
                "parameters": mcp_tool["inputSchema"],
            },
        }

    return {
        "type": "function",
        "function": {
            "name": mcp_tool.name,
            "description": mcp_tool.description,
            "parameters": mcp_tool.inputSchema,
        },
    }
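# Example of the resulting OpenAI tool schema (illustrative input):
#   mcp_tool_metadata_to_openai_tool({"name": "search", "description": "Web search",
#                                     "inputSchema": {"type": "object", "properties": {}}})
#   => {"type": "function", "function": {"name": "search", "description": "Web search",
#       "parameters": {"type": "object", "properties": {}}}}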
143
rag/utils/minio_conn.py
Normal file
@@ -0,0 +1,143 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import time
from minio import Minio
from minio.error import S3Error
from io import BytesIO
from rag import settings
from rag.utils import singleton


@singleton
class RAGFlowMinio:
    def __init__(self):
        self.conn = None
        self.__open__()

    def __open__(self):
        try:
            if self.conn:
                self.__close__()
        except Exception:
            pass

        try:
            self.conn = Minio(settings.MINIO["host"],
                              access_key=settings.MINIO["user"],
                              secret_key=settings.MINIO["password"],
                              secure=False
                              )
        except Exception:
            logging.exception(
                "Fail to connect %s " % settings.MINIO["host"])

    def __close__(self):
        del self.conn
        self.conn = None

    def health(self):
        bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
        if not self.conn.bucket_exists(bucket):
            self.conn.make_bucket(bucket)
        r = self.conn.put_object(bucket, fnm,
                                 BytesIO(binary),
                                 len(binary)
                                 )
        return r

    def put(self, bucket, fnm, binary):
        for _ in range(3):
            try:
                if not self.conn.bucket_exists(bucket):
                    self.conn.make_bucket(bucket)

                r = self.conn.put_object(bucket, fnm,
                                         BytesIO(binary),
                                         len(binary)
                                         )
                return r
            except Exception:
                logging.exception(f"Fail to put {bucket}/{fnm}:")
                self.__open__()
                time.sleep(1)

    def rm(self, bucket, fnm):
        try:
            self.conn.remove_object(bucket, fnm)
        except Exception:
            logging.exception(f"Fail to remove {bucket}/{fnm}:")

    def get(self, bucket, filename):
        for _ in range(1):
            try:
                r = self.conn.get_object(bucket, filename)
                return r.read()
            except Exception:
                logging.exception(f"Fail to get {bucket}/{filename}")
                self.__open__()
                time.sleep(1)
        return

    def obj_exist(self, bucket, filename):
        try:
            if not self.conn.bucket_exists(bucket):
                return False
            if self.conn.stat_object(bucket, filename):
                return True
            else:
                return False
        except S3Error as e:
            if e.code in ["NoSuchKey", "NoSuchBucket", "ResourceNotFound"]:
                return False
        except Exception:
            logging.exception(f"obj_exist {bucket}/{filename} got exception")
            return False

    def bucket_exists(self, bucket):
        try:
            if not self.conn.bucket_exists(bucket):
                return False
            else:
                return True
        except S3Error as e:
            if e.code in ["NoSuchKey", "NoSuchBucket", "ResourceNotFound"]:
                return False
        except Exception:
            logging.exception(f"bucket_exist {bucket} got exception")
            return False

    def get_presigned_url(self, bucket, fnm, expires):
        for _ in range(10):
            try:
                return self.conn.get_presigned_url("GET", bucket, fnm, expires)
            except Exception:
                logging.exception(f"Fail to get_presigned {bucket}/{fnm}:")
                self.__open__()
                time.sleep(1)
        return

    def remove_bucket(self, bucket):
        try:
            if self.conn.bucket_exists(bucket):
                objects_to_delete = self.conn.list_objects(bucket, recursive=True)
                for obj in objects_to_delete:
                    self.conn.remove_object(bucket, obj.object_name)
                self.conn.remove_bucket(bucket)
        except Exception:
            logging.exception(f"Fail to remove bucket {bucket}")
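# Minimal usage sketch (bucket and object names are illustrative). Note that put()
# retries up to three times and reopens the connection between attempts:
#   store = RAGFlowMinio()
#   store.put("kb-bucket", "doc.pdf", b"%PDF-...")
#   data = store.get("kb-bucket", "doc.pdf")
#   store.rm("kb-bucket", "doc.pdf")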
118
rag/utils/opendal_conn.py
Normal file
@@ -0,0 +1,118 @@
import opendal
import logging
import pymysql
from urllib.parse import quote_plus

from api.utils.configs import get_base_config
from rag.utils import singleton


CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS `{}` (
    `key` VARCHAR(255) PRIMARY KEY,
    `value` LONGBLOB,
    `created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);
"""
SET_MAX_ALLOWED_PACKET_SQL = """
SET GLOBAL max_allowed_packet={}
"""


def get_opendal_config():
    try:
        opendal_config = get_base_config('opendal', {})
        if opendal_config.get("scheme", "mysql") == 'mysql':
            mysql_config = get_base_config('mysql', {})
            max_packet = mysql_config.get("max_allowed_packet", 134217728)
            kwargs = {
                "scheme": "mysql",
                "host": mysql_config.get("host", "127.0.0.1"),
                "port": str(mysql_config.get("port", 3306)),
                "user": mysql_config.get("user", "root"),
                "password": mysql_config.get("password", ""),
                "database": mysql_config.get("name", "test_open_dal"),
                "table": opendal_config.get("config", {}).get("oss_table", "opendal_storage"),
                "max_allowed_packet": str(max_packet)
            }
            kwargs["connection_string"] = f"mysql://{kwargs['user']}:{quote_plus(kwargs['password'])}@{kwargs['host']}:{kwargs['port']}/{kwargs['database']}?max_allowed_packet={max_packet}"
        else:
            scheme = opendal_config.get("scheme")
            config_data = opendal_config.get("config", {})
            kwargs = {"scheme": scheme, **config_data}
        logging.info("Loaded OpenDAL configuration from yaml: %s", kwargs)
        return kwargs
    except Exception as e:
        logging.error("Failed to load OpenDAL configuration from yaml: %s", str(e))
        raise
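# Example of the connection string assembled above (illustrative credentials,
# password URL-escaped by quote_plus):
#   mysql://root:p%40ssword@127.0.0.1:3306/test_open_dal?max_allowed_packet=134217728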
@singleton
class OpenDALStorage:
    def __init__(self):
        self._kwargs = get_opendal_config()
        self._scheme = self._kwargs.get('scheme', 'mysql')
        if self._scheme == 'mysql':
            self.init_db_config()
            self.init_opendal_mysql_table()
        self._operator = opendal.Operator(**self._kwargs)

        logging.info("OpenDALStorage initialized successfully")

    def health(self):
        bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
        r = self._operator.write(f"{bucket}/{fnm}", binary)
        return r

    def put(self, bucket, fnm, binary):
        self._operator.write(f"{bucket}/{fnm}", binary)

    def get(self, bucket, fnm):
        return self._operator.read(f"{bucket}/{fnm}")

    def rm(self, bucket, fnm):
        self._operator.delete(f"{bucket}/{fnm}")
        self._operator.__init__()

    def scan(self, bucket, fnm):
        return self._operator.scan(f"{bucket}/{fnm}")

    def obj_exist(self, bucket, fnm):
        return self._operator.exists(f"{bucket}/{fnm}")

    def init_db_config(self):
        try:
            conn = pymysql.connect(
                host=self._kwargs['host'],
                port=int(self._kwargs['port']),
                user=self._kwargs['user'],
                password=self._kwargs['password'],
                database=self._kwargs['database']
            )
            cursor = conn.cursor()
            max_packet = self._kwargs.get('max_allowed_packet', 4194304)  # Default to 4MB if not specified
            cursor.execute(SET_MAX_ALLOWED_PACKET_SQL.format(max_packet))
            conn.commit()
            cursor.close()
            conn.close()
            logging.info(f"Database configuration initialized with max_allowed_packet={max_packet}")
        except Exception as e:
            logging.error(f"Failed to initialize database configuration: {str(e)}")
            raise

    def init_opendal_mysql_table(self):
        conn = pymysql.connect(
            host=self._kwargs['host'],
            port=int(self._kwargs['port']),
            user=self._kwargs['user'],
            password=self._kwargs['password'],
            database=self._kwargs['database']
        )
        cursor = conn.cursor()
        cursor.execute(CREATE_TABLE_SQL.format(self._kwargs['table']))
        conn.commit()
        cursor.close()
        conn.close()
        logging.info(f"Table `{self._kwargs['table']}` initialized.")
561
rag/utils/opensearch_conn.py
Normal file
@@ -0,0 +1,561 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re
import json
import time
import os

import copy
from opensearchpy import OpenSearch, NotFoundError
from opensearchpy import UpdateByQuery, Q, Search, Index
from opensearchpy import ConnectionTimeout
from rag import settings
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
    FusionExpr
from rag.nlp import is_english, rag_tokenizer

ATTEMPT_TIME = 2

logger = logging.getLogger('ragflow.opensearch_conn')


@singleton
class OSConnection(DocStoreConnection):
    def __init__(self):
        self.info = {}
        logger.info(f"Use OpenSearch {settings.OS['hosts']} as the doc engine.")
        for _ in range(ATTEMPT_TIME):
            try:
                self.os = OpenSearch(
                    settings.OS["hosts"].split(","),
                    http_auth=(settings.OS["username"], settings.OS[
                        "password"]) if "username" in settings.OS and "password" in settings.OS else None,
                    verify_certs=False,
                    timeout=600
                )
                if self.os:
                    self.info = self.os.info()
                    break
            except Exception as e:
                logger.warning(f"{str(e)}. Waiting for OpenSearch {settings.OS['hosts']} to be healthy.")
                time.sleep(5)
        if not self.os.ping():
            msg = f"OpenSearch {settings.OS['hosts']} is unhealthy in 120s."
            logger.error(msg)
            raise Exception(msg)
        v = self.info.get("version", {"number": "2.18.0"})
        v = v["number"].split(".")[0]
        if int(v) < 2:
            msg = f"OpenSearch version must be greater than or equal to 2, current version: {v}"
            logger.error(msg)
            raise Exception(msg)
        fp_mapping = os.path.join(get_project_base_directory(), "conf", "os_mapping.json")
        if not os.path.exists(fp_mapping):
            msg = f"OpenSearch mapping file not found at {fp_mapping}"
            logger.error(msg)
            raise Exception(msg)
        self.mapping = json.load(open(fp_mapping, "r"))
        logger.info(f"OpenSearch {settings.OS['hosts']} is healthy.")

    """
    Database operations
    """

    def dbType(self) -> str:
        return "opensearch"

    def health(self) -> dict:
        health_dict = dict(self.os.cluster.health())
        health_dict["type"] = "opensearch"
        return health_dict

    """
    Table operations
    """

    def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
        if self.indexExist(indexName, knowledgebaseId):
            return True
        try:
            from opensearchpy.client import IndicesClient
            return IndicesClient(self.os).create(index=indexName,
                                                 body=self.mapping)
        except Exception:
            logger.exception("OSConnection.createIndex error %s" % (indexName))

    def deleteIdx(self, indexName: str, knowledgebaseId: str):
        if len(knowledgebaseId) > 0:
            # The index needs to stay alive after any kb deletion, since all kbs under this tenant share one index.
            return
        try:
            self.os.indices.delete(index=indexName, allow_no_indices=True)
        except NotFoundError:
            pass
        except Exception:
            logger.exception("OSConnection.deleteIdx error %s" % (indexName))

    def indexExist(self, indexName: str, knowledgebaseId: str = None) -> bool:
        s = Index(indexName, self.os)
        for i in range(ATTEMPT_TIME):
            try:
                return s.exists()
            except Exception as e:
                logger.exception("OSConnection.indexExist got exception")
                if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
                    continue
                break
        return False

    """
    CRUD operations
    """

    def search(
            self, selectFields: list[str],
            highlightFields: list[str],
            condition: dict,
            matchExprs: list[MatchExpr],
            orderBy: OrderByExpr,
            offset: int,
            limit: int,
            indexNames: str | list[str],
            knowledgebaseIds: list[str],
            aggFields: list[str] = [],
            rank_feature: dict | None = None
    ):
        """
        Refers to https://github.com/opensearch-project/opensearch-py/blob/main/guides/dsl.md
        """
        use_knn = False
        if isinstance(indexNames, str):
            indexNames = indexNames.split(",")
        assert isinstance(indexNames, list) and len(indexNames) > 0
        assert "_id" not in condition

        bqry = Q("bool", must=[])
        condition["kb_id"] = knowledgebaseIds
        for k, v in condition.items():
            if k == "available_int":
                if v == 0:
                    bqry.filter.append(Q("range", available_int={"lt": 1}))
                else:
                    bqry.filter.append(
                        Q("bool", must_not=Q("range", available_int={"lt": 1})))
                continue
            if not v:
                continue
            if isinstance(v, list):
                bqry.filter.append(Q("terms", **{k: v}))
            elif isinstance(v, str) or isinstance(v, int):
                bqry.filter.append(Q("term", **{k: v}))
            else:
                raise Exception(
                    f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")

        s = Search()
        vector_similarity_weight = 0.5
        for m in matchExprs:
            if isinstance(m, FusionExpr) and m.method == "weighted_sum" and "weights" in m.fusion_params:
                assert len(matchExprs) == 3 and isinstance(matchExprs[0], MatchTextExpr) \
                    and isinstance(matchExprs[1], MatchDenseExpr) and isinstance(matchExprs[2], FusionExpr)
                weights = m.fusion_params["weights"]
                vector_similarity_weight = float(weights.split(",")[1])
        knn_query = {}
        for m in matchExprs:
            if isinstance(m, MatchTextExpr):
                minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
                if isinstance(minimum_should_match, float):
                    minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                bqry.must.append(Q("query_string", fields=m.fields,
                                   type="best_fields", query=m.matching_text,
                                   minimum_should_match=minimum_should_match,
                                   boost=1))
                bqry.boost = 1.0 - vector_similarity_weight

            # The Elasticsearch Python SDK encapsulates KNN search, while the
            # OpenSearch Python SDK does not, so the following code implements
            # KNN search in OpenSearch with the DSL directly. OpenSearch's KNN
            # query syntax also differs from Elasticsearch's, so some
            # adaptations were made for it.
            elif isinstance(m, MatchDenseExpr):
                assert (bqry is not None)
                similarity = 0.0
                if "similarity" in m.extra_options:
                    similarity = m.extra_options["similarity"]
                use_knn = True
                vector_column_name = m.vector_column_name
                knn_query[vector_column_name] = {}
                knn_query[vector_column_name]["vector"] = list(m.embedding_data)
                knn_query[vector_column_name]["k"] = m.topn
                knn_query[vector_column_name]["filter"] = bqry.to_dict()
                knn_query[vector_column_name]["boost"] = similarity

        if bqry and rank_feature:
            for fld, sc in rank_feature.items():
                if fld != PAGERANK_FLD:
                    fld = f"{TAG_FLD}.{fld}"
                bqry.should.append(Q("rank_feature", field=fld, linear={}, boost=sc))

        if bqry:
            s = s.query(bqry)
        for field in highlightFields:
            s = s.highlight(field, force_source=True, no_match_size=30, require_field_match=False)

        if orderBy:
            orders = list()
            for field, order in orderBy.fields:
                order = "asc" if order == 0 else "desc"
                if field in ["page_num_int", "top_int"]:
                    order_info = {"order": order, "unmapped_type": "float",
                                  "mode": "avg", "numeric_type": "double"}
                elif field.endswith("_int") or field.endswith("_flt"):
                    order_info = {"order": order, "unmapped_type": "float"}
                else:
                    order_info = {"order": order, "unmapped_type": "text"}
                orders.append({field: order_info})
            s = s.sort(*orders)

        for fld in aggFields:
            s.aggs.bucket(f'aggs_{fld}', 'terms', field=fld, size=1000000)

        if limit > 0:
            s = s[offset:offset + limit]
        q = s.to_dict()
        logger.debug(f"OSConnection.search {str(indexNames)} query: " + json.dumps(q))

        if use_knn:
            del q["query"]
            q["query"] = {"knn": knn_query}

        for i in range(ATTEMPT_TIME):
            try:
                res = self.os.search(index=indexNames,
                                     body=q,
                                     timeout=600,
                                     # search_type="dfs_query_then_fetch",
                                     track_total_hits=True,
                                     _source=True)
                if str(res.get("timed_out", "")).lower() == "true":
                    raise Exception("OpenSearch Timeout.")
                logger.debug(f"OSConnection.search {str(indexNames)} res: " + str(res))
                return res
            except Exception as e:
                logger.exception(f"OSConnection.search {str(indexNames)} query: " + str(q))
                if str(e).find("Timeout") > 0:
                    continue
                raise e
        logger.error(f"OSConnection.search timeout for {ATTEMPT_TIME} times!")
        raise Exception("OSConnection.search timeout.")
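    # Shape of the hand-built KNN query described above (the vector column name
    # and values are illustrative):
    #   {"query": {"knn": {"q_768_vec": {
    #       "vector": [0.1, 0.2, ...], "k": 10,
    #       "filter": {"bool": {...}}, "boost": 0.5}}}}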
    def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
        for i in range(ATTEMPT_TIME):
            try:
                res = self.os.get(index=(indexName),
                                  id=chunkId, _source=True, )
                if str(res.get("timed_out", "")).lower() == "true":
                    raise Exception("OpenSearch Timeout.")
                chunk = res["_source"]
                chunk["id"] = chunkId
                return chunk
            except NotFoundError:
                return None
            except Exception as e:
                logger.exception(f"OSConnection.get({chunkId}) got exception")
                if str(e).find("Timeout") > 0:
                    continue
                raise e
        logger.error(f"OSConnection.get timeout for {ATTEMPT_TIME} times!")
        raise Exception("OSConnection.get timeout.")

    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
        # Refers to https://opensearch.org/docs/latest/api-reference/document-apis/bulk/
        operations = []
        for d in documents:
            assert "_id" not in d
            assert "id" in d
            d_copy = copy.deepcopy(d)
            meta_id = d_copy.pop("id", "")
            operations.append(
                {"index": {"_index": indexName, "_id": meta_id}})
            operations.append(d_copy)

        res = []
        for _ in range(ATTEMPT_TIME):
            try:
                res = []
                r = self.os.bulk(index=(indexName), body=operations,
                                 refresh=False, timeout=60)
                if re.search(r"False", str(r["errors"]), re.IGNORECASE):
                    return res

                for item in r["items"]:
                    for action in ["create", "delete", "index", "update"]:
                        if action in item and "error" in item[action]:
                            res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
                return res
            except Exception as e:
                res.append(str(e))
                logger.warning("OSConnection.insert got exception: " + str(e))
                res = []
                if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
                    res.append(str(e))
                    time.sleep(3)
                    continue
        return res

    def update(self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str) -> bool:
        doc = copy.deepcopy(newValue)
        doc.pop("id", None)
        if "id" in condition and isinstance(condition["id"], str):
            # update a specific single document
            chunkId = condition["id"]
            for i in range(ATTEMPT_TIME):
                try:
                    self.os.update(index=indexName, id=chunkId, body={"doc": doc})
                    return True
                except Exception as e:
                    logger.exception(
                        f"OSConnection.update(index={indexName}, id={chunkId}, doc={json.dumps(condition, ensure_ascii=False)}) got exception")
                    if re.search(r"(timeout|connection)", str(e).lower()):
                        continue
                    break
            return False

        # update an unspecified set of documents, possibly more than one
        bqry = Q("bool")
        for k, v in condition.items():
            if not isinstance(k, str) or not v:
                continue
            if k == "exists":
                bqry.filter.append(Q("exists", field=v))
                continue
            if isinstance(v, list):
                bqry.filter.append(Q("terms", **{k: v}))
            elif isinstance(v, str) or isinstance(v, int):
                bqry.filter.append(Q("term", **{k: v}))
            else:
                raise Exception(
                    f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
        scripts = []
        params = {}
        for k, v in newValue.items():
            if k == "remove":
                if isinstance(v, str):
                    scripts.append(f"ctx._source.remove('{v}');")
                if isinstance(v, dict):
                    for kk, vv in v.items():
                        scripts.append(f"int i=ctx._source.{kk}.indexOf(params.p_{kk});ctx._source.{kk}.remove(i);")
                        params[f"p_{kk}"] = vv
                continue
            if k == "add":
                if isinstance(v, dict):
                    for kk, vv in v.items():
                        scripts.append(f"ctx._source.{kk}.add(params.pp_{kk});")
                        params[f"pp_{kk}"] = vv.strip()
                continue
            if (not isinstance(k, str) or not v) and k != "available_int":
                continue
            if isinstance(v, str):
                v = re.sub(r"(['\n\r]|\\.)", " ", v)
                params[f"pp_{k}"] = v
                scripts.append(f"ctx._source.{k}=params.pp_{k};")
            elif isinstance(v, int) or isinstance(v, float):
                scripts.append(f"ctx._source.{k}={v};")
            elif isinstance(v, list):
                scripts.append(f"ctx._source.{k}=params.pp_{k};")
                params[f"pp_{k}"] = json.dumps(v, ensure_ascii=False)
            else:
                raise Exception(
                    f"newValue `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str.")
        ubq = UpdateByQuery(
            index=indexName).using(
            self.os).query(bqry)
        ubq = ubq.script(source="".join(scripts), params=params)
        ubq = ubq.params(refresh=True)
        ubq = ubq.params(slices=5)
        ubq = ubq.params(conflicts="proceed")

        for _ in range(ATTEMPT_TIME):
            try:
                _ = ubq.execute()
                return True
            except Exception as e:
                logger.error("OSConnection.update got exception: " + str(e) + "\n".join(scripts))
                if re.search(r"(timeout|connection|conflict)", str(e).lower()):
                    continue
                break
        return False
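    # Illustrative output of the script-building loop above for
    # newValue = {"add": {"tag_kwd": "x"}, "available_int": 1}:
    #   scripts == ["ctx._source.tag_kwd.add(params.pp_tag_kwd);",
    #               "ctx._source.available_int=1;"]
    #   params == {"pp_tag_kwd": "x"}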
    def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
        qry = None
        assert "_id" not in condition
        if "id" in condition:
            chunk_ids = condition["id"]
            if not isinstance(chunk_ids, list):
                chunk_ids = [chunk_ids]
            if not chunk_ids:  # when chunk_ids is empty, delete all
                qry = Q("match_all")
            else:
                qry = Q("ids", values=chunk_ids)
        else:
            qry = Q("bool")
            for k, v in condition.items():
                if k == "exists":
                    qry.filter.append(Q("exists", field=v))

                elif k == "must_not":
                    if isinstance(v, dict):
                        for kk, vv in v.items():
                            if kk == "exists":
                                qry.must_not.append(Q("exists", field=vv))

                elif isinstance(v, list):
                    qry.must.append(Q("terms", **{k: v}))
                elif isinstance(v, str) or isinstance(v, int):
                    qry.must.append(Q("term", **{k: v}))
                else:
                    raise Exception("Condition value must be int, str or list.")
        logger.debug("OSConnection.delete query: " + json.dumps(qry.to_dict()))
        for _ in range(ATTEMPT_TIME):
            try:
                # print(Search().query(qry).to_dict(), flush=True)
                res = self.os.delete_by_query(
                    index=indexName,
                    body=Search().query(qry).to_dict(),
                    refresh=True)
                return res["deleted"]
            except Exception as e:
                logger.warning("OSConnection.delete got exception: " + str(e))
                if re.search(r"(timeout|connection)", str(e).lower()):
                    time.sleep(3)
                    continue
                if re.search(r"(not_found)", str(e), re.IGNORECASE):
                    return 0
        return 0

    """
    Helper functions for search result
    """

    def getTotal(self, res):
        if isinstance(res["hits"]["total"], type({})):
            return res["hits"]["total"]["value"]
        return res["hits"]["total"]

    def getChunkIds(self, res):
        return [d["_id"] for d in res["hits"]["hits"]]

    def __getSource(self, res):
        rr = []
        for d in res["hits"]["hits"]:
            d["_source"]["id"] = d["_id"]
            d["_source"]["_score"] = d["_score"]
            rr.append(d["_source"])
        return rr

    def getFields(self, res, fields: list[str]) -> dict[str, dict]:
        res_fields = {}
        if not fields:
            return {}
        for d in self.__getSource(res):
            m = {n: d.get(n) for n in fields if d.get(n) is not None}
            for n, v in m.items():
                if isinstance(v, list):
                    m[n] = v
                    continue
                if not isinstance(v, str):
                    m[n] = str(m[n])
                # if n.find("tks") > 0:
                #     m[n] = rmSpace(m[n])

            if m:
                res_fields[d["id"]] = m
        return res_fields

    def getHighlight(self, res, keywords: list[str], fieldnm: str):
        ans = {}
        for d in res["hits"]["hits"]:
            hlts = d.get("highlight")
            if not hlts:
                continue
            txt = "...".join([a for a in list(hlts.items())[0][1]])
            if not is_english(txt.split()):
                ans[d["_id"]] = txt
                continue

            txt = d["_source"][fieldnm]
            txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
            txts = []
            for t in re.split(r"[.?!;\n]", txt):
                for w in keywords:
                    t = re.sub(r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])" % re.escape(w), r"\1<em>\2</em>\3", t,
                               flags=re.IGNORECASE | re.MULTILINE)
                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE):
                    continue
                txts.append(t)
            ans[d["_id"]] = "...".join(txts) if txts else "...".join([a for a in list(hlts.items())[0][1]])

        return ans

    def getAggregation(self, res, fieldnm: str):
        agg_field = "aggs_" + fieldnm
        if "aggregations" not in res or agg_field not in res["aggregations"]:
            return list()
        bkts = res["aggregations"][agg_field]["buckets"]
        return [(b["key"], b["doc_count"]) for b in bkts]

    """
    SQL
    """

    def sql(self, sql: str, fetch_size: int, format: str):
        logger.debug(f"OSConnection.sql get sql: {sql}")
        sql = re.sub(r"[ `]+", " ", sql)
        sql = sql.replace("%", "")
        replaces = []
        for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
            fld, v = r.group(1), r.group(3)
            match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
                fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
            replaces.append(
                ("{}{}'{}'".format(
                    r.group(1),
                    r.group(2),
                    r.group(3)),
                 match))

        for p, r in replaces:
            sql = sql.replace(p, r, 1)
        logger.debug(f"OSConnection.sql to os: {sql}")

        for i in range(ATTEMPT_TIME):
            try:
                res = self.os.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format,
                                        request_timeout="2s")
                return res
            except ConnectionTimeout:
                logger.exception("OSConnection.sql timeout")
                continue
            except Exception:
                logger.exception("OSConnection.sql got exception")
                return None
        logger.error(f"OSConnection.sql timeout for {ATTEMPT_TIME} times!")
        return None
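    # Illustrative rewrite performed above (the tokenized text is hypothetical):
    #   ... WHERE content_ltks like 'machine learning'
    # becomes
    #   ... WHERE MATCH(content_ltks, 'machine learning', 'operator=OR;minimum_should_match=30%')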
174
rag/utils/oss_conn.py
Normal file
@@ -0,0 +1,174 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
import time
from io import BytesIO
from rag.utils import singleton
from rag import settings


@singleton
class RAGFlowOSS:
    def __init__(self):
        self.conn = None
        self.oss_config = settings.OSS
        self.access_key = self.oss_config.get('access_key', None)
        self.secret_key = self.oss_config.get('secret_key', None)
        self.endpoint_url = self.oss_config.get('endpoint_url', None)
        self.region = self.oss_config.get('region', None)
        self.bucket = self.oss_config.get('bucket', None)
        self.prefix_path = self.oss_config.get('prefix_path', None)
        self.__open__()

    @staticmethod
    def use_default_bucket(method):
        def wrapper(self, bucket, *args, **kwargs):
            # If there is a default bucket, use the default bucket
            actual_bucket = self.bucket if self.bucket else bucket
            return method(self, actual_bucket, *args, **kwargs)
        return wrapper

    @staticmethod
    def use_prefix_path(method):
        def wrapper(self, bucket, fnm, *args, **kwargs):
            # If the prefix path is set, use the prefix path
            fnm = f"{self.prefix_path}/{fnm}" if self.prefix_path else fnm
            return method(self, bucket, fnm, *args, **kwargs)
        return wrapper
|
||||
|
||||
def __open__(self):
|
||||
try:
|
||||
if self.conn:
|
||||
self.__close__()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
# Reference:https://help.aliyun.com/zh/oss/developer-reference/use-amazon-s3-sdks-to-access-oss
|
||||
self.conn = boto3.client(
|
||||
's3',
|
||||
region_name=self.region,
|
||||
aws_access_key_id=self.access_key,
|
||||
aws_secret_access_key=self.secret_key,
|
||||
endpoint_url=self.endpoint_url,
|
||||
config=Config(s3={"addressing_style": "virtual"}, signature_version='v4')
|
||||
)
|
||||
except Exception:
|
||||
logging.exception(f"Fail to connect at region {self.region}")
|
||||
|
||||
def __close__(self):
|
||||
del self.conn
|
||||
self.conn = None
|
||||
|
||||
@use_default_bucket
|
||||
def bucket_exists(self, bucket):
|
||||
try:
|
||||
logging.debug(f"head_bucket bucketname {bucket}")
|
||||
self.conn.head_bucket(Bucket=bucket)
|
||||
exists = True
|
||||
except ClientError:
|
||||
logging.exception(f"head_bucket error {bucket}")
|
||||
exists = False
|
||||
return exists
|
||||
|
||||
def health(self):
|
||||
bucket = self.bucket
|
||||
fnm = "txtxtxtxt1"
|
||||
fnm, binary = f"{self.prefix_path}/{fnm}" if self.prefix_path else fnm, b"_t@@@1"
|
||||
if not self.bucket_exists(bucket):
|
||||
self.conn.create_bucket(Bucket=bucket)
|
||||
logging.debug(f"create bucket {bucket} ********")
|
||||
|
||||
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
|
||||
return r
|
||||
|
||||
def get_properties(self, bucket, key):
|
||||
return {}
|
||||
|
||||
def list(self, bucket, dir, recursive=True):
|
||||
return []
|
||||
|
||||
@use_prefix_path
|
||||
@use_default_bucket
|
||||
def put(self, bucket, fnm, binary):
|
||||
logging.debug(f"bucket name {bucket}; filename :{fnm}:")
|
||||
for _ in range(1):
|
||||
try:
|
||||
if not self.bucket_exists(bucket):
|
||||
self.conn.create_bucket(Bucket=bucket)
|
||||
logging.info(f"create bucket {bucket} ********")
|
||||
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
|
||||
|
||||
return r
|
||||
except Exception:
|
||||
logging.exception(f"Fail put {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
|
||||
@use_prefix_path
|
||||
@use_default_bucket
|
||||
def rm(self, bucket, fnm):
|
||||
try:
|
||||
self.conn.delete_object(Bucket=bucket, Key=fnm)
|
||||
except Exception:
|
||||
logging.exception(f"Fail rm {bucket}/{fnm}")
|
||||
|
||||
@use_prefix_path
|
||||
@use_default_bucket
|
||||
def get(self, bucket, fnm):
|
||||
for _ in range(1):
|
||||
try:
|
||||
r = self.conn.get_object(Bucket=bucket, Key=fnm)
|
||||
object_data = r['Body'].read()
|
||||
return object_data
|
||||
except Exception:
|
||||
logging.exception(f"fail get {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
|
||||
@use_prefix_path
|
||||
@use_default_bucket
|
||||
def obj_exist(self, bucket, fnm):
|
||||
try:
|
||||
if self.conn.head_object(Bucket=bucket, Key=fnm):
|
||||
return True
|
||||
except ClientError as e:
|
||||
if e.response['Error']['Code'] == '404':
|
||||
return False
|
||||
else:
|
||||
raise
|
||||
|
||||
@use_prefix_path
|
||||
@use_default_bucket
|
||||
def get_presigned_url(self, bucket, fnm, expires):
|
||||
for _ in range(10):
|
||||
try:
|
||||
r = self.conn.generate_presigned_url('get_object',
|
||||
Params={'Bucket': bucket,
|
||||
'Key': fnm},
|
||||
ExpiresIn=expires)
|
||||
|
||||
return r
|
||||
except Exception:
|
||||
logging.exception(f"fail get url {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
|
||||
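For orientation, a minimal usage sketch of this client. The bucket name and key are illustrative, and settings.OSS must already be populated; with a default bucket configured, the decorators override the bucket argument and prepend prefix_path to the key.

from rag.utils.oss_conn import RAGFlowOSS

oss = RAGFlowOSS()  # singleton; reads settings.OSS on first construction
oss.put("my-bucket", "docs/readme.txt", b"hello")                 # illustrative bucket/key
data = oss.get("my-bucket", "docs/readme.txt")                    # -> b"hello"
url = oss.get_presigned_url("my-bucket", "docs/readme.txt", expires=3600)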
393
rag/utils/redis_conn.py
Normal file
@@ -0,0 +1,393 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import json
import uuid

import valkey as redis
from rag import settings
from rag.utils import singleton
from valkey.lock import Lock
import trio


class RedisMsg:
    def __init__(self, consumer, queue_name, group_name, msg_id, message):
        self.__consumer = consumer
        self.__queue_name = queue_name
        self.__group_name = group_name
        self.__msg_id = msg_id
        self.__message = json.loads(message["message"])

    def ack(self):
        try:
            self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id)
            return True
        except Exception as e:
            logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e))
        return False

    def get_message(self):
        return self.__message

    def get_msg_id(self):
        return self.__msg_id


@singleton
class RedisDB:
    lua_delete_if_equal = None
    LUA_DELETE_IF_EQUAL_SCRIPT = """
        local current_value = redis.call('get', KEYS[1])
        if current_value and current_value == ARGV[1] then
            redis.call('del', KEYS[1])
            return 1
        end
        return 0
    """

    def __init__(self):
        self.REDIS = None
        self.config = settings.REDIS
        self.__open__()

    def register_scripts(self) -> None:
        cls = self.__class__
        client = self.REDIS
        cls.lua_delete_if_equal = client.register_script(cls.LUA_DELETE_IF_EQUAL_SCRIPT)

    def __open__(self):
        try:
            self.REDIS = redis.StrictRedis(
                host=self.config["host"].split(":")[0],
                port=int(self.config.get("host", ":6379").split(":")[1]),
                db=int(self.config.get("db", 1)),
                password=self.config.get("password"),
                decode_responses=True,
            )
            self.register_scripts()
        except Exception:
            logging.warning("Redis can't be connected.")
        return self.REDIS

    def health(self):
        self.REDIS.ping()
        a, b = "xx", "yy"
        self.REDIS.set(a, b, 3)

        if self.REDIS.get(a) == b:
            return True

    def info(self):
        info = self.REDIS.info()
        return {
            'redis_version': info["redis_version"],
            'server_mode': info["server_mode"],
            'used_memory': info["used_memory_human"],
            'total_system_memory': info["total_system_memory_human"],
            'mem_fragmentation_ratio': info["mem_fragmentation_ratio"],
            'connected_clients': info["connected_clients"],
            'blocked_clients': info["blocked_clients"],
            'instantaneous_ops_per_sec': info["instantaneous_ops_per_sec"],
            'total_commands_processed': info["total_commands_processed"]
        }

    def is_alive(self):
        return self.REDIS is not None

    def exist(self, k):
        if not self.REDIS:
            return
        try:
            return self.REDIS.exists(k)
        except Exception as e:
            logging.warning("RedisDB.exist " + str(k) + " got exception: " + str(e))
            self.__open__()

    def get(self, k):
        if not self.REDIS:
            return
        try:
            return self.REDIS.get(k)
        except Exception as e:
            logging.warning("RedisDB.get " + str(k) + " got exception: " + str(e))
            self.__open__()

    def set_obj(self, k, obj, exp=3600):
        try:
            self.REDIS.set(k, json.dumps(obj, ensure_ascii=False), exp)
            return True
        except Exception as e:
            logging.warning("RedisDB.set_obj " + str(k) + " got exception: " + str(e))
            self.__open__()
        return False

    def set(self, k, v, exp=3600):
        try:
            self.REDIS.set(k, v, exp)
            return True
        except Exception as e:
            logging.warning("RedisDB.set " + str(k) + " got exception: " + str(e))
            self.__open__()
        return False

    def sadd(self, key: str, member: str):
        try:
            self.REDIS.sadd(key, member)
            return True
        except Exception as e:
            logging.warning("RedisDB.sadd " + str(key) + " got exception: " + str(e))
            self.__open__()
        return False

    def srem(self, key: str, member: str):
        try:
            self.REDIS.srem(key, member)
            return True
        except Exception as e:
            logging.warning("RedisDB.srem " + str(key) + " got exception: " + str(e))
            self.__open__()
        return False

    def smembers(self, key: str):
        try:
            res = self.REDIS.smembers(key)
            return res
        except Exception as e:
            logging.warning(
                "RedisDB.smembers " + str(key) + " got exception: " + str(e)
            )
            self.__open__()
        return None

    def zadd(self, key: str, member: str, score: float):
        try:
            self.REDIS.zadd(key, {member: score})
            return True
        except Exception as e:
            logging.warning("RedisDB.zadd " + str(key) + " got exception: " + str(e))
            self.__open__()
        return False

    def zcount(self, key: str, min: float, max: float):
        try:
            res = self.REDIS.zcount(key, min, max)
            return res
        except Exception as e:
            logging.warning("RedisDB.zcount " + str(key) + " got exception: " + str(e))
            self.__open__()
        return 0

    def zpopmin(self, key: str, count: int):
        try:
            res = self.REDIS.zpopmin(key, count)
            return res
        except Exception as e:
            logging.warning("RedisDB.zpopmin " + str(key) + " got exception: " + str(e))
            self.__open__()
        return None

    def zrangebyscore(self, key: str, min: float, max: float):
        try:
            res = self.REDIS.zrangebyscore(key, min, max)
            return res
        except Exception as e:
            logging.warning(
                "RedisDB.zrangebyscore " + str(key) + " got exception: " + str(e)
            )
            self.__open__()
        return None

    def transaction(self, key, value, exp=3600):
        try:
            pipeline = self.REDIS.pipeline(transaction=True)
            pipeline.set(key, value, exp, nx=True)
            pipeline.execute()
            return True
        except Exception as e:
            logging.warning(
                "RedisDB.transaction " + str(key) + " got exception: " + str(e)
            )
            self.__open__()
        return False

    def queue_product(self, queue, message) -> bool:
        for _ in range(3):
            try:
                payload = {"message": json.dumps(message)}
                self.REDIS.xadd(queue, payload)
                return True
            except Exception as e:
                logging.exception(
                    "RedisDB.queue_product " + str(queue) + " got exception: " + str(e)
                )
                self.__open__()
        return False

    def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> RedisMsg:
        """https://redis.io/docs/latest/commands/xreadgroup/"""
        for _ in range(3):
            try:
                try:
                    group_info = self.REDIS.xinfo_groups(queue_name)
                    if not any(gi["name"] == group_name for gi in group_info):
                        self.REDIS.xgroup_create(queue_name, group_name, id="0", mkstream=True)
                except redis.exceptions.ResponseError as e:
                    if "no such key" in str(e).lower():
                        self.REDIS.xgroup_create(queue_name, group_name, id="0", mkstream=True)
                    elif "busygroup" in str(e).lower():
                        logging.warning("Group already exists, continue.")
                    else:
                        raise

                args = {
                    "groupname": group_name,
                    "consumername": consumer_name,
                    "count": 1,
                    "block": 5,
                    "streams": {queue_name: msg_id},
                }
                messages = self.REDIS.xreadgroup(**args)
                if not messages:
                    return None
                stream, element_list = messages[0]
                if not element_list:
                    return None
                msg_id, payload = element_list[0]
                res = RedisMsg(self.REDIS, queue_name, group_name, msg_id, payload)
                return res
            except Exception as e:
                if str(e) == 'no such key':
                    pass
                else:
                    logging.exception(
                        "RedisDB.queue_consumer "
                        + str(queue_name)
                        + " got exception: "
                        + str(e)
                    )
                self.__open__()
        return None

    def get_unacked_iterator(self, queue_names: list[str], group_name, consumer_name):
        try:
            for queue_name in queue_names:
                try:
                    group_info = self.REDIS.xinfo_groups(queue_name)
                except Exception as e:
                    if str(e) == 'no such key':
                        logging.warning(f"RedisDB.get_unacked_iterator queue {queue_name} doesn't exist")
                        continue
                    raise  # anything else is handled by the outer except
                if not any(gi["name"] == group_name for gi in group_info):
                    logging.warning(f"RedisDB.get_unacked_iterator queue {queue_name} group {group_name} doesn't exist")
                    continue
                current_min = 0
                while True:
                    payload = self.queue_consumer(queue_name, group_name, consumer_name, current_min)
                    if not payload:
                        break
                    current_min = payload.get_msg_id()
                    logging.info(f"RedisDB.get_unacked_iterator {queue_name} {consumer_name} {current_min}")
                    yield payload
        except Exception:
            logging.exception("RedisDB.get_unacked_iterator got exception: ")
            self.__open__()

    def get_pending_msg(self, queue, group_name):
        try:
            messages = self.REDIS.xpending_range(queue, group_name, '-', '+', 10)
            return messages
        except Exception as e:
            if 'No such key' not in (str(e) or ''):
                logging.warning(
                    "RedisDB.get_pending_msg " + str(queue) + " got exception: " + str(e)
                )
        return []

    def requeue_msg(self, queue: str, group_name: str, msg_id: str):
        for _ in range(3):
            try:
                messages = self.REDIS.xrange(queue, msg_id, msg_id)
                if messages:
                    self.REDIS.xadd(queue, messages[0][1])
                    self.REDIS.xack(queue, group_name, msg_id)
                return  # stop retrying once the requeue round-trip succeeded
            except Exception as e:
                logging.warning(
                    "RedisDB.requeue_msg " + str(queue) + " got exception: " + str(e)
                )
                self.__open__()

    def queue_info(self, queue, group_name) -> dict | None:
        for _ in range(3):
            try:
                groups = self.REDIS.xinfo_groups(queue)
                for group in groups:
                    if group["name"] == group_name:
                        return group
            except Exception as e:
                logging.warning(
                    "RedisDB.queue_info " + str(queue) + " got exception: " + str(e)
                )
                self.__open__()
        return None

    def delete_if_equal(self, key: str, expected_value: str) -> bool:
        """
        Do the following atomically:
        Delete a key if its value equals the given one; do nothing otherwise.
        """
        return bool(self.lua_delete_if_equal(keys=[key], args=[expected_value], client=self.REDIS))

    def delete(self, key) -> bool:
        try:
            self.REDIS.delete(key)
            return True
        except Exception as e:
            logging.warning("RedisDB.delete " + str(key) + " got exception: " + str(e))
            self.__open__()
        return False


REDIS_CONN = RedisDB()


class RedisDistributedLock:
    def __init__(self, lock_key, lock_value=None, timeout=10, blocking_timeout=1):
        self.lock_key = lock_key
        if lock_value:
            self.lock_value = lock_value
        else:
            self.lock_value = str(uuid.uuid4())
        self.timeout = timeout
        self.lock = Lock(REDIS_CONN.REDIS, lock_key, timeout=timeout, blocking_timeout=blocking_timeout)

    def acquire(self):
        REDIS_CONN.delete_if_equal(self.lock_key, self.lock_value)
        return self.lock.acquire(token=self.lock_value)

    async def spin_acquire(self):
        REDIS_CONN.delete_if_equal(self.lock_key, self.lock_value)
        while True:
            if self.lock.acquire(token=self.lock_value):
                break
            await trio.sleep(10)

    def release(self):
        REDIS_CONN.delete_if_equal(self.lock_key, self.lock_value)
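A minimal usage sketch of the stream queue and the distributed lock. The queue, group, and lock names are illustrative; a reachable Redis/Valkey configured in settings.REDIS is assumed.

from rag.utils.redis_conn import REDIS_CONN, RedisDistributedLock

# Producer side: serialize a task onto a stream.
REDIS_CONN.queue_product("task_queue", {"doc_id": "abc", "action": "parse"})

# Consumer side: read one message for this group/consumer, then ack it.
msg = REDIS_CONN.queue_consumer("task_queue", "task_group", "worker-1")
if msg:
    print(msg.get_message())  # -> {"doc_id": "abc", "action": "parse"}
    msg.ack()

# Cross-process mutual exclusion around a critical section.
lock = RedisDistributedLock("kg_build_lock", timeout=60)
if lock.acquire():
    try:
        pass  # exclusive work here
    finally:
        lock.release()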
208
rag/utils/s3_conn.py
Normal file
@@ -0,0 +1,208 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config
import time
from io import BytesIO
from rag.utils import singleton
from rag import settings


@singleton
class RAGFlowS3:
    def __init__(self):
        self.conn = None
        self.s3_config = settings.S3
        self.access_key = self.s3_config.get('access_key', None)
        self.secret_key = self.s3_config.get('secret_key', None)
        self.session_token = self.s3_config.get('session_token', None)
        self.region_name = self.s3_config.get('region_name', None)
        self.endpoint_url = self.s3_config.get('endpoint_url', None)
        self.signature_version = self.s3_config.get('signature_version', None)
        self.addressing_style = self.s3_config.get('addressing_style', None)
        self.bucket = self.s3_config.get('bucket', None)
        self.prefix_path = self.s3_config.get('prefix_path', None)
        self.__open__()

    @staticmethod
    def use_default_bucket(method):
        def wrapper(self, bucket, *args, **kwargs):
            # If there is a default bucket, use the default bucket
            actual_bucket = self.bucket if self.bucket else bucket
            return method(self, actual_bucket, *args, **kwargs)
        return wrapper

    @staticmethod
    def use_prefix_path(method):
        def wrapper(self, bucket, fnm, *args, **kwargs):
            # If the prefix path is set, use it; the bucket passed from the
            # upstream call becomes part of the key prefix. This is especially
            # useful when you're using the default bucket.
            if self.prefix_path:
                fnm = f"{self.prefix_path}/{bucket}/{fnm}"
            return method(self, bucket, fnm, *args, **kwargs)
        return wrapper

    def __open__(self):
        try:
            if self.conn:
                self.__close__()
        except Exception:
            pass

        try:
            s3_params = {}
            config_kwargs = {}
            # If ak/sk are not set, the boto3 S3 client tries several other ways to authenticate.
            # See: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials
            if self.access_key and self.secret_key:
                s3_params = {
                    'aws_access_key_id': self.access_key,
                    'aws_secret_access_key': self.secret_key,
                    'aws_session_token': self.session_token,
                }
            if self.region_name:
                s3_params['region_name'] = self.region_name
            if self.endpoint_url:
                s3_params['endpoint_url'] = self.endpoint_url

            # Configure signature_version and addressing_style through a Config object
            if self.signature_version:
                config_kwargs['signature_version'] = self.signature_version
            if self.addressing_style:
                config_kwargs['s3'] = {'addressing_style': self.addressing_style}

            if config_kwargs:
                s3_params['config'] = Config(**config_kwargs)

            self.conn = [boto3.client('s3', **s3_params)]
        except Exception:
            logging.exception(f"Fail to connect at region {self.region_name} or endpoint {self.endpoint_url}")

    def __close__(self):
        del self.conn[0]
        self.conn = None

    @use_default_bucket
    def bucket_exists(self, bucket, *args, **kwargs):
        try:
            logging.debug(f"head_bucket bucketname {bucket}")
            self.conn[0].head_bucket(Bucket=bucket)
            exists = True
        except ClientError:
            logging.exception(f"head_bucket error {bucket}")
            exists = False
        return exists

    def health(self):
        bucket = self.bucket
        fnm = "txtxtxtxt1"
        fnm, binary = f"{self.prefix_path}/{fnm}" if self.prefix_path else fnm, b"_t@@@1"
        if not self.bucket_exists(bucket):
            self.conn[0].create_bucket(Bucket=bucket)
            logging.debug(f"create bucket {bucket} ********")

        r = self.conn[0].upload_fileobj(BytesIO(binary), bucket, fnm)
        return r

    def get_properties(self, bucket, key):
        return {}

    def list(self, bucket, dir, recursive=True):
        return []

    @use_prefix_path
    @use_default_bucket
    def put(self, bucket, fnm, binary, *args, **kwargs):
        logging.debug(f"bucket name {bucket}; filename :{fnm}:")
        for _ in range(1):
            try:
                if not self.bucket_exists(bucket):
                    self.conn[0].create_bucket(Bucket=bucket)
                    logging.info(f"create bucket {bucket} ********")
                r = self.conn[0].upload_fileobj(BytesIO(binary), bucket, fnm)

                return r
            except Exception:
                logging.exception(f"Fail put {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)

    @use_prefix_path
    @use_default_bucket
    def rm(self, bucket, fnm, *args, **kwargs):
        try:
            self.conn[0].delete_object(Bucket=bucket, Key=fnm)
        except Exception:
            logging.exception(f"Fail rm {bucket}/{fnm}")

    @use_prefix_path
    @use_default_bucket
    def get(self, bucket, fnm, *args, **kwargs):
        for _ in range(1):
            try:
                r = self.conn[0].get_object(Bucket=bucket, Key=fnm)
                object_data = r['Body'].read()
                return object_data
            except Exception:
                logging.exception(f"fail get {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)
        return

    @use_prefix_path
    @use_default_bucket
    def obj_exist(self, bucket, fnm, *args, **kwargs):
        try:
            if self.conn[0].head_object(Bucket=bucket, Key=fnm):
                return True
        except ClientError as e:
            if e.response['Error']['Code'] == '404':
                return False
            else:
                raise

    @use_prefix_path
    @use_default_bucket
    def get_presigned_url(self, bucket, fnm, expires, *args, **kwargs):
        for _ in range(10):
            try:
                r = self.conn[0].generate_presigned_url('get_object',
                                                        Params={'Bucket': bucket,
                                                                'Key': fnm},
                                                        ExpiresIn=expires)

                return r
            except Exception:
                logging.exception(f"fail get url {bucket}/{fnm}")
                self.__open__()
                time.sleep(1)
        return

    @use_default_bucket
    def rm_bucket(self, bucket, *args, **kwargs):
        # boto3 clients expose no MinIO-style bucket_exists/list iteration,
        # so check and enumerate explicitly before deleting the bucket itself.
        try:
            if not self.bucket_exists(bucket):
                return
            resp = self.conn[0].list_objects_v2(Bucket=bucket)
            for o in resp.get('Contents', []):
                self.conn[0].delete_object(Bucket=bucket, Key=o['Key'])
            self.conn[0].delete_bucket(Bucket=bucket)
        except Exception as e:
            logging.error(f"Fail rm {bucket}: " + str(e))
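For reference, a hedged sketch of what a settings.S3 mapping could look like for this client. The keys mirror the get() calls in __init__ above; the values are placeholders, not project defaults.

S3 = {
    'access_key': 'AKIA...',        # omit ak/sk to fall back to boto3's credential chain
    'secret_key': '...',
    'session_token': None,
    'region_name': 'us-east-1',
    'endpoint_url': None,           # set for S3-compatible stores
    'signature_version': 's3v4',
    'addressing_style': 'virtual',  # or 'path'
    'bucket': 'ragflow',            # optional default bucket
    'prefix_path': None,            # optional key prefix
}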
53
rag/utils/storage_factory.py
Normal file
@@ -0,0 +1,53 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from enum import Enum

from rag.utils.azure_sas_conn import RAGFlowAzureSasBlob
from rag.utils.azure_spn_conn import RAGFlowAzureSpnBlob
from rag.utils.minio_conn import RAGFlowMinio
from rag.utils.opendal_conn import OpenDALStorage
from rag.utils.s3_conn import RAGFlowS3
from rag.utils.oss_conn import RAGFlowOSS


class Storage(Enum):
    MINIO = 1
    AZURE_SPN = 2
    AZURE_SAS = 3
    AWS_S3 = 4
    OSS = 5
    OPENDAL = 6


class StorageFactory:
    storage_mapping = {
        Storage.MINIO: RAGFlowMinio,
        Storage.AZURE_SPN: RAGFlowAzureSpnBlob,
        Storage.AZURE_SAS: RAGFlowAzureSasBlob,
        Storage.AWS_S3: RAGFlowS3,
        Storage.OSS: RAGFlowOSS,
        Storage.OPENDAL: OpenDALStorage
    }

    @classmethod
    def create(cls, storage: Storage):
        return cls.storage_mapping[storage]()


STORAGE_IMPL_TYPE = os.getenv('STORAGE_IMPL', 'MINIO')
STORAGE_IMPL = StorageFactory.create(Storage[STORAGE_IMPL_TYPE])
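Backend selection is environment-driven and happens at import time. A minimal sketch of how a caller picks this up (the environment value and bucket/key are illustrative):

import os
os.environ['STORAGE_IMPL'] = 'AWS_S3'  # must be set before the module is first imported

from rag.utils.storage_factory import STORAGE_IMPL
# STORAGE_IMPL is now a RAGFlowS3 singleton; every backend exposes the same
# put/get/rm/obj_exist surface, so callers stay backend-agnostic.
STORAGE_IMPL.put("kb-bucket", "chunk-0001", b"payload")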
68
rag/utils/tavily_conn.py
Normal file
@@ -0,0 +1,68 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from tavily import TavilyClient
from api.utils import get_uuid
from rag.nlp import rag_tokenizer


class Tavily:
    def __init__(self, api_key: str):
        self.tavily_client = TavilyClient(api_key=api_key)

    def search(self, query):
        try:
            response = self.tavily_client.search(
                query=query,
                search_depth="advanced",
                max_results=6
            )
            return [{"url": res["url"], "title": res["title"], "content": res["content"], "score": res["score"]} for res in response["results"]]
        except Exception as e:
            logging.exception(e)

        return []

    def retrieve_chunks(self, question):
        chunks = []
        aggs = []
        logging.info("[Tavily]Q: " + question)
        for r in self.search(question):
            id = get_uuid()
            chunks.append({
                "chunk_id": id,
                "content_ltks": rag_tokenizer.tokenize(r["content"]),
                "content_with_weight": r["content"],
                "doc_id": id,
                "docnm_kwd": r["title"],
                "kb_id": [],
                "important_kwd": [],
                "image_id": "",
                "similarity": r["score"],
                "vector_similarity": 1.,
                "term_similarity": 0,
                "vector": [],
                "positions": [],
                "url": r["url"]
            })
            aggs.append({
                "doc_name": r["title"],
                "doc_id": id,
                "count": 1,
                "url": r["url"]
            })
            logging.info("[Tavily]R: " + r["content"][:128] + "...")
        return {"chunks": chunks, "doc_aggs": aggs}
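A minimal usage sketch of the web-search connector. The API key is a placeholder; the tavily-python package, network access, and the project's rag_tokenizer on the import path are assumed.

from rag.utils.tavily_conn import Tavily

tav = Tavily(api_key="tvly-...")  # placeholder key
res = tav.retrieve_chunks("What is retrieval-augmented generation?")
for chunk in res["chunks"]:
    print(chunk["docnm_kwd"], chunk["url"], chunk["similarity"])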