Convert Flask to FastAPI
api/utils/__init__.py (new file, +132 lines)
@@ -0,0 +1,132 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import datetime
import hashlib
import os
import socket
import time
import uuid
import requests

import importlib

from .common import string_to_bytes


def current_timestamp():
    return int(time.time() * 1000)


def timestamp_to_date(timestamp, format_string="%Y-%m-%d %H:%M:%S"):
    if not timestamp:
        timestamp = time.time()
    timestamp = int(timestamp) / 1000
    time_array = time.localtime(timestamp)
    str_date = time.strftime(format_string, time_array)
    return str_date


def date_string_to_timestamp(time_str, format_string="%Y-%m-%d %H:%M:%S"):
    time_array = time.strptime(time_str, format_string)
    time_stamp = int(time.mktime(time_array) * 1000)
    return time_stamp
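
# Illustrative round trip for the millisecond-based helpers above (hypothetical
# helper, not part of the original file; the values in the comments are examples):
def _demo_timestamp_roundtrip():
    ms = current_timestamp()               # e.g. 1714032000000
    txt = timestamp_to_date(ms)            # e.g. "2024-04-25 08:00:00" (local time)
    back = date_string_to_timestamp(txt)   # milliseconds again, truncated to the second
    assert abs(ms - back) < 1000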

def get_lan_ip():
    if os.name != "nt":
        import fcntl
        import struct

        def get_interface_ip(ifname):
            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            return socket.inet_ntoa(
                fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', string_to_bytes(ifname[:15])))[20:24])

    ip = socket.gethostbyname(socket.getfqdn())
    if ip.startswith("127.") and os.name != "nt":
        interfaces = [
            "bond1",
            "eth0",
            "eth1",
            "eth2",
            "wlan0",
            "wlan1",
            "wifi0",
            "ath0",
            "ath1",
            "ppp0",
        ]
        for ifname in interfaces:
            try:
                ip = get_interface_ip(ifname)
                break
            except IOError:
                pass
    return ip or ''


def from_dict_hook(in_dict: dict):
    if "type" in in_dict and "data" in in_dict:
        if in_dict["module"] is None:
            return in_dict["data"]
        else:
            return getattr(importlib.import_module(
                in_dict["module"]), in_dict["type"])(**in_dict["data"])
    else:
        return in_dict


def get_uuid():
    return uuid.uuid1().hex


def datetime_format(date_time: datetime.datetime) -> datetime.datetime:
    return datetime.datetime(date_time.year, date_time.month, date_time.day,
                             date_time.hour, date_time.minute, date_time.second)


def get_format_time() -> datetime.datetime:
    return datetime_format(datetime.datetime.now())


def str2date(date_time: str):
    return datetime.datetime.strptime(date_time, '%Y-%m-%d')


def elapsed2time(elapsed):
    seconds = elapsed / 1000
    minuter, second = divmod(seconds, 60)
    hour, minuter = divmod(minuter, 60)
    return '%02d:%02d:%02d' % (hour, minuter, second)


def download_img(url):
    if not url:
        return ""
    response = requests.get(url)
    return "data:" + \
        response.headers.get('Content-Type', 'image/jpg') + ";" + \
        "base64," + base64.b64encode(response.content).decode("utf-8")


def delta_seconds(date_string: str):
    dt = datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
    return (datetime.datetime.now() - dt).total_seconds()


def hash_str2int(line: str, mod: int = 10 ** 8) -> int:
    return int(hashlib.sha1(line.encode("utf-8")).hexdigest(), 16) % mod
api/utils/api_utils.py (new file, +873 lines)
@@ -0,0 +1,873 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import asyncio
import functools
import json
import logging
import os
import queue
import random
import threading
import time
from base64 import b64encode
from copy import deepcopy
from functools import wraps
from hmac import HMAC
from io import BytesIO
from typing import Any, Callable, Coroutine, Optional, Type, Union
from urllib.parse import quote, urlencode
from uuid import uuid1

import requests
import trio
# FastAPI imports
from fastapi import Request, Response as FastAPIResponse, HTTPException, status
from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
from fastapi import Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from itsdangerous import URLSafeTimedSerializer
from peewee import OperationalError
from werkzeug.http import HTTP_STATUS_CODES

from api import settings
from api.constants import REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC
from api.db import ActiveEnum
from api.db.db_models import APIToken
from api.db.services import UserService
from api.db.services.llm_service import LLMService
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils.json import CustomJSONEncoder, json_dumps

# FastAPI security scheme
security = HTTPBearer()

from api.utils import get_uuid
from rag.utils.mcp_tool_call_conn import MCPToolCallSession, close_multiple_mcp_toolcall_sessions

requests.models.complexjson.dumps = functools.partial(json.dumps, cls=CustomJSONEncoder)


def serialize_for_json(obj):
    """
    Recursively serialize objects to make them JSON serializable.
    Handles ModelMetaclass and other non-serializable objects.
    """
    if hasattr(obj, '__dict__'):
        # For objects with __dict__, try to serialize their attributes
        try:
            return {key: serialize_for_json(value) for key, value in obj.__dict__.items()
                    if not key.startswith('_')}
        except (AttributeError, TypeError):
            return str(obj)
    elif hasattr(obj, '__name__'):
        # For classes and metaclasses, return their name
        return f"<{obj.__module__}.{obj.__name__}>" if hasattr(obj, '__module__') else f"<{obj.__name__}>"
    elif isinstance(obj, (list, tuple)):
        return [serialize_for_json(item) for item in obj]
    elif isinstance(obj, dict):
        return {key: serialize_for_json(value) for key, value in obj.items()}
    elif isinstance(obj, (str, int, float, bool)) or obj is None:
        return obj
    else:
        # Fallback: convert to string representation
        return str(obj)


def request(**kwargs):
    sess = requests.Session()
    stream = kwargs.pop("stream", sess.stream)
    timeout = kwargs.pop("timeout", None)
    kwargs["headers"] = {k.replace("_", "-").upper(): v for k, v in kwargs.get("headers", {}).items()}
    prepped = requests.Request(**kwargs).prepare()

    if settings.CLIENT_AUTHENTICATION and settings.HTTP_APP_KEY and settings.SECRET_KEY:
        timestamp = str(round(time.time() * 1000))
        nonce = str(uuid1())
        signature = b64encode(
            HMAC(
                settings.SECRET_KEY.encode("ascii"),
                b"\n".join(
                    [
                        timestamp.encode("ascii"),
                        nonce.encode("ascii"),
                        settings.HTTP_APP_KEY.encode("ascii"),
                        prepped.path_url.encode("ascii"),
                        prepped.body if kwargs.get("json") else b"",
                        urlencode(sorted(kwargs["data"].items()), quote_via=quote, safe="-._~").encode("ascii") if kwargs.get("data") and isinstance(kwargs["data"], dict) else b"",
                    ]
                ),
                "sha1",
            ).digest()
        ).decode("ascii")

        prepped.headers.update(
            {
                "TIMESTAMP": timestamp,
                "NONCE": nonce,
                "APP-KEY": settings.HTTP_APP_KEY,
                "SIGNATURE": signature,
            }
        )

    return sess.send(prepped, stream=stream, timeout=timeout)


def get_exponential_backoff_interval(retries, full_jitter=False):
    """Calculate the exponential backoff wait time."""
    # Will be zero if factor equals 0
    countdown = min(REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC * (2**retries))
    # Full jitter according to
    # https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
    if full_jitter:
        countdown = random.randrange(countdown + 1)
    # Adjust according to maximum wait time and account for negative values.
    return max(0, countdown)
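
# Illustrative schedule (assumed constants, not part of the original file):
# with REQUEST_WAIT_SEC = 2 and REQUEST_MAX_WAIT_SEC = 60, retries 0..5 give
# 2, 4, 8, 16, 32, 60 seconds; full_jitter=True instead draws a random value
# from [0, countdown].
def _demo_backoff_schedule():
    return [get_exponential_backoff_interval(retries) for retries in range(6)]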

def get_data_error_result(code=settings.RetCode.DATA_ERROR, message="Sorry! Data missing!"):
    logging.exception(Exception(message))
    result_dict = {"code": code, "message": message}
    response = {}
    for key, value in result_dict.items():
        if value is None and key != "code":
            continue
        else:
            response[key] = value
    return JSONResponse(content=response)


def server_error_response(e):
    logging.exception(e)
    try:
        if e.code == 401:
            return get_json_result(code=401, message=repr(e))
    except BaseException:
        pass
    if len(e.args) > 1:
        try:
            serialized_data = serialize_for_json(e.args[1])
            return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=serialized_data)
        except Exception:
            return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=None)
    if repr(e).find("index_not_found_exception") >= 0:
        return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message="No chunk found, please upload file and parse it.")

    return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))


def error_response(response_code, message=None):
    if message is None:
        message = HTTP_STATUS_CODES.get(response_code, "Unknown Error")

    return JSONResponse(
        content={
            "message": message,
            "code": response_code,
        },
        status_code=response_code,
    )


# FastAPI version: request validation is done with Pydantic models rather than decorators.
# This decorator is no longer needed because FastAPI validates Pydantic models automatically.
def validate_request(*args, **kwargs):
    """
    Deprecated decorator: use Pydantic models for validation in FastAPI.
    Kept for backward compatibility; it performs no validation.
    """
    def wrapper(func):
        @wraps(func)
        def decorated_function(*_args, **_kwargs):
            # No manual validation is needed in FastAPI; Pydantic handles it automatically.
            return func(*_args, **_kwargs)
        return decorated_function
    return wrapper
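
# Illustrative sketch (hypothetical router, model and path; not part of this
# commit): FastAPI validates the request body against a Pydantic model and
# rejects bad payloads with a 422 before the handler runs, which is why the
# old @validate_request(...) style decorator no longer has anything to do.
from fastapi import APIRouter
from pydantic import BaseModel

_demo_router = APIRouter()


class _CreateDatasetBody(BaseModel):
    # Hypothetical request model: "name" is required, "description" optional.
    name: str
    description: str | None = None


@_demo_router.post("/datasets")
async def _create_dataset(body: _CreateDatasetBody):
    # body.name is guaranteed to be a present, validated string here.
    return {"code": 0, "data": {"name": body.name}}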

def not_allowed_parameters(*params):
    """
    Deprecated decorator: use Pydantic models for validation in FastAPI.
    Kept for backward compatibility; it performs no validation.
    """
    def decorator(f):
        def wrapper(*args, **kwargs):
            # No manual validation is needed in FastAPI; Pydantic handles it automatically.
            return f(*args, **kwargs)
        return wrapper
    return decorator


def active_required(f):
    """
    Deprecated decorator: use dependency injection for user verification in FastAPI.
    Kept for backward compatibility; it performs no validation.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        # In FastAPI, user verification is handled via dependency injection.
        return f(*args, **kwargs)
    return wrapper


def is_localhost(ip):
    return ip in {"127.0.0.1", "::1", "[::1]", "localhost"}


def send_file_in_mem(data, filename):
    """
    Send file data held in memory.
    Note: in FastAPI this function would need the Request to build a proper response.
    """
    if not isinstance(data, (str, bytes)):
        data = json_dumps(data)
    if isinstance(data, str):
        data = data.encode("utf-8")

    f = BytesIO()
    f.write(data)
    f.seek(0)

    # In FastAPI, use FileResponse or StreamingResponse instead.
    # Here we return the file object; the caller is responsible for building the response.
    return f


def get_json_result(code=settings.RetCode.SUCCESS, message="success", data=None):
    response = {"code": code, "message": message, "data": data}
    return JSONResponse(content=response)


def apikey_required(func):
    """
    Deprecated decorator: use dependency injection for API key verification in FastAPI.
    Kept for backward compatibility; it performs no validation.
    """
    @wraps(func)
    def decorated_function(*args, **kwargs):
        # In FastAPI, API key verification is handled via dependency injection.
        return func(*args, **kwargs)
    return decorated_function


def build_error_result(code=settings.RetCode.FORBIDDEN, message="success"):
    response = {"code": code, "message": message}
    return JSONResponse(content=response, status_code=code)


def construct_response(code=settings.RetCode.SUCCESS, message="success", data=None, auth=None):
    result_dict = {"code": code, "message": message, "data": data}
    response_dict = {}
    for key, value in result_dict.items():
        if value is None and key != "code":
            continue
        else:
            response_dict[key] = value

    headers = {
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Method": "*",
        "Access-Control-Allow-Headers": "*",
        "Access-Control-Expose-Headers": "Authorization"
    }
    if auth:
        headers["Authorization"] = auth

    return JSONResponse(content=response_dict, headers=headers)


def construct_result(code=settings.RetCode.DATA_ERROR, message="data is missing"):
    result_dict = {"code": code, "message": message}
    response = {}
    for key, value in result_dict.items():
        if value is None and key != "code":
            continue
        else:
            response[key] = value
    return JSONResponse(content=response)


def construct_json_result(code=settings.RetCode.SUCCESS, message="success", data=None):
    if data is None:
        return JSONResponse(content={"code": code, "message": message})
    else:
        return JSONResponse(content={"code": code, "message": message, "data": data})


def construct_error_response(e):
    logging.exception(e)
    try:
        if e.code == 401:
            return construct_json_result(code=settings.RetCode.UNAUTHORIZED, message=repr(e))
    except BaseException:
        pass
    if len(e.args) > 1:
        return construct_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
    return construct_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))


def token_required(func):
    """
    Deprecated decorator: use dependency injection for token verification in FastAPI.
    Kept for backward compatibility; it performs no validation.
    """
    @wraps(func)
    def decorated_function(*args, **kwargs):
        # In FastAPI, token verification is handled via dependency injection.
        return func(*args, **kwargs)
    return decorated_function


def get_result(code=settings.RetCode.SUCCESS, message="", data=None):
    if code == 0:
        if data is not None:
            response = {"code": code, "data": data}
        else:
            response = {"code": code}
    else:
        response = {"code": code, "message": message}
    return JSONResponse(content=response)


def get_error_data_result(
        message="Sorry! Data missing!",
        code=settings.RetCode.DATA_ERROR,
):
    result_dict = {"code": code, "message": message}
    response = {}
    for key, value in result_dict.items():
        if value is None and key != "code":
            continue
        else:
            response[key] = value
    return JSONResponse(content=response)


def get_error_argument_result(message="Invalid arguments"):
    return get_result(code=settings.RetCode.ARGUMENT_ERROR, message=message)


# FastAPI dependency-injection helpers
async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Get the current user (FastAPI version)."""
    from api.db import StatusEnum
    try:
        jwt = URLSafeTimedSerializer(secret_key=settings.SECRET_KEY)
        authorization = credentials.credentials

        if authorization:
            try:
                access_token = str(jwt.loads(authorization))

                if not access_token or not access_token.strip():
                    raise HTTPException(
                        status_code=status.HTTP_401_UNAUTHORIZED,
                        detail="Authentication attempt with empty access token"
                    )

                # Access tokens should be UUIDs (32 hex characters)
                if len(access_token.strip()) < 32:
                    raise HTTPException(
                        status_code=status.HTTP_401_UNAUTHORIZED,
                        detail=f"Authentication attempt with invalid token format: {len(access_token)} chars"
                    )

                user = UserService.query(
                    access_token=access_token, status=StatusEnum.VALID.value
                )
                if user:
                    if not user[0].access_token or not user[0].access_token.strip():
                        raise HTTPException(
                            status_code=status.HTTP_401_UNAUTHORIZED,
                            detail="Authentication attempt with empty access token"
                        )
                    return user[0]
                else:
                    raise HTTPException(
                        status_code=status.HTTP_401_UNAUTHORIZED,
                        detail="Authentication failed: Invalid access token"
                    )
            except Exception as e:
                raise HTTPException(
                    status_code=status.HTTP_401_UNAUTHORIZED,
                    detail=f"Authentication failed: {str(e)}"
                )
        else:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Authentication failed: No authorization header"
            )
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Authentication failed: {str(e)}"
        )


async def get_current_user_optional(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Get the current user if available (FastAPI version)."""
    try:
        return await get_current_user(credentials)
    except HTTPException:
        return None
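
# Illustrative sketch (hypothetical path and handler, not part of this commit):
# a route consumes the dependency above instead of the old decorator-based
# login check. FastAPI resolves the bearer token through get_current_user
# before the handler runs and turns failures into 401 responses automatically.
from fastapi import APIRouter

_user_router = APIRouter()


@_user_router.get("/user/profile")
async def _user_profile(current_user=Depends(get_current_user)):
    # current_user is the validated user row returned by get_current_user.
    return get_json_result(data={"user_id": current_user.id})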

async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Verify the API key (FastAPI version)."""
    try:
        token = credentials.credentials
        objs = APIToken.query(token=token)
        if not objs:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="API-KEY is invalid!"
            )
        return objs[0]
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=f"API Key verification failed: {str(e)}"
        )


def create_file_response(data, filename: str, media_type: str = "application/octet-stream"):
    """Create a file-download response (FastAPI version)."""
    if not isinstance(data, (str, bytes)):
        data = json_dumps(data)
    if isinstance(data, str):
        data = data.encode("utf-8")

    return StreamingResponse(
        BytesIO(data),
        media_type=media_type,
        headers={"Content-Disposition": f"attachment; filename={filename}"}
    )
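
# Illustrative sketch (hypothetical endpoint, not part of this commit):
# streaming an in-memory export back to the client with the helper above.
from fastapi import APIRouter

_export_router = APIRouter()


@_export_router.get("/datasets/export")
async def _export_datasets(current_user=Depends(get_current_user)):
    payload = {"exported_by": current_user.id, "items": []}
    return create_file_response(payload, "datasets.json", media_type="application/json")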

def get_error_permission_result(message="Permission error"):
    return get_result(code=settings.RetCode.PERMISSION_ERROR, message=message)


def get_error_operating_result(message="Operating error"):
    return get_result(code=settings.RetCode.OPERATING_ERROR, message=message)


def generate_confirmation_token(tenant_id):
    serializer = URLSafeTimedSerializer(tenant_id)
    return "ragflow-" + serializer.dumps(get_uuid(), salt=tenant_id)[2:34]


def get_parser_config(chunk_method, parser_config):
    if not chunk_method:
        chunk_method = "naive"

    # Define default configurations for each chunking method
    key_mapping = {
        "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "tag": None,
        "resume": None,
        "manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "table": None,
        "paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "one": None,
        "knowledge_graph": {
            "chunk_token_num": 8192,
            "delimiter": r"\n",
            "entity_types": ["organization", "person", "location", "event", "time"],
            "raptor": {"use_raptor": False},
            "graphrag": {"use_graphrag": False},
        },
        "email": None,
        "picture": None,
    }

    default_config = key_mapping[chunk_method]

    # If no parser_config provided, return default
    if not parser_config:
        return default_config

    # If parser_config is provided, merge with defaults to ensure required fields exist
    if default_config is None:
        return parser_config

    # Ensure raptor and graphrag fields have default values if not provided
    merged_config = deep_merge(default_config, parser_config)

    return merged_config


def get_data_openai(
        id=None,
        created=None,
        model=None,
        prompt_tokens=0,
        completion_tokens=0,
        content=None,
        finish_reason=None,
        object="chat.completion",
        param=None,
        stream=False
):
    total_tokens = prompt_tokens + completion_tokens

    if stream:
        return {
            "id": f"{id}",
            "object": "chat.completion.chunk",
            "model": model,
            "choices": [{
                "delta": {"content": content},
                "finish_reason": finish_reason,
                "index": 0,
            }],
        }

    return {
        "id": f"{id}",
        "object": object,
        "created": int(time.time()) if created else None,
        "model": model,
        "param": param,
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
            "completion_tokens_details": {
                "reasoning_tokens": 0,
                "accepted_prediction_tokens": 0,
                "rejected_prediction_tokens": 0,
            },
        },
        "choices": [{
            "message": {
                "role": "assistant",
                "content": content
            },
            "logprobs": None,
            "finish_reason": finish_reason,
            "index": 0,
        }],
    }


def check_duplicate_ids(ids, id_type="item"):
    """
    Check for duplicate IDs in a list and return unique IDs and error messages.

    Args:
        ids (list): List of IDs to check for duplicates
        id_type (str): Type of ID for error messages (e.g., 'document', 'dataset', 'chunk')

    Returns:
        tuple: (unique_ids, error_messages)
            - unique_ids (list): List of unique IDs
            - error_messages (list): List of error messages for duplicate IDs
    """
    id_count = {}
    duplicate_messages = []

    # Count occurrences of each ID
    for id_value in ids:
        id_count[id_value] = id_count.get(id_value, 0) + 1

    # Check for duplicates
    for id_value, count in id_count.items():
        if count > 1:
            duplicate_messages.append(f"Duplicate {id_type} ids: {id_value}")

    # Return unique IDs and error messages
    return list(set(ids)), duplicate_messages


def verify_embedding_availability(embd_id: str, tenant_id: str) -> tuple[bool, JSONResponse | None]:
    """
    Verifies availability of an embedding model for a specific tenant.

    Performs comprehensive verification through:
    1. Identifier Parsing: Decomposes embd_id into name and factory components
    2. System Verification: Checks model registration in LLMService
    3. Tenant Authorization: Validates tenant-specific model assignments
    4. Built-in Model Check: Confirms inclusion in predefined system models

    Args:
        embd_id (str): Unique identifier for the embedding model in format "model_name@factory"
        tenant_id (str): Tenant identifier for access control

    Returns:
        tuple[bool, Response | None]:
            - First element (bool):
                - True: Model is available and authorized
                - False: Validation failed
            - Second element contains:
                - None on success
                - Error detail dict on failure

    Raises:
        ValueError: When model identifier format is invalid
        OperationalError: When database connection fails (auto-handled)

    Examples:
        >>> verify_embedding_availability("text-embedding@openai", "tenant_123")
        (True, None)

        >>> verify_embedding_availability("invalid_model", "tenant_123")
        (False, {'code': 101, 'message': "Unsupported model: <invalid_model>"})
    """
    try:
        llm_name, llm_factory = TenantLLMService.split_model_name_and_factory(embd_id)
        in_llm_service = bool(LLMService.query(llm_name=llm_name, fid=llm_factory, model_type="embedding"))

        tenant_llms = TenantLLMService.get_my_llms(tenant_id=tenant_id)
        is_tenant_model = any(llm["llm_name"] == llm_name and llm["llm_factory"] == llm_factory and llm["model_type"] == "embedding" for llm in tenant_llms)

        is_builtin_model = embd_id in settings.BUILTIN_EMBEDDING_MODELS
        if not (is_builtin_model or is_tenant_model or in_llm_service):
            return False, get_error_argument_result(f"Unsupported model: <{embd_id}>")

        if not (is_builtin_model or is_tenant_model):
            return False, get_error_argument_result(f"Unauthorized model: <{embd_id}>")
    except OperationalError as e:
        logging.exception(e)
        return False, get_error_data_result(message="Database operation failed")

    return True, None


def deep_merge(default: dict, custom: dict) -> dict:
    """
    Recursively merges two dictionaries with priority given to `custom` values.

    Creates a deep copy of the `default` dictionary and iteratively merges nested
    dictionaries using a stack-based approach. Non-dict values in `custom` will
    completely override corresponding entries in `default`.

    Args:
        default (dict): Base dictionary containing default values.
        custom (dict): Dictionary containing overriding values.

    Returns:
        dict: New merged dictionary combining values from both inputs.

    Example:
        >>> from copy import deepcopy
        >>> default = {"a": 1, "nested": {"x": 10, "y": 20}}
        >>> custom = {"b": 2, "nested": {"y": 99, "z": 30}}
        >>> deep_merge(default, custom)
        {'a': 1, 'b': 2, 'nested': {'x': 10, 'y': 99, 'z': 30}}

        >>> deep_merge({"config": {"mode": "auto"}}, {"config": "manual"})
        {'config': 'manual'}

    Notes:
        1. Merge priority is always given to `custom` values at all nesting levels
        2. Non-dict values (e.g. list, str) in `custom` will replace entire values
           in `default`, even if the original value was a dictionary
        3. Time complexity: O(N) where N is total key-value pairs in `custom`
        4. Recommended for configuration merging and nested data updates
    """
    merged = deepcopy(default)
    stack = [(merged, custom)]

    while stack:
        base_dict, override_dict = stack.pop()

        for key, val in override_dict.items():
            if key in base_dict and isinstance(val, dict) and isinstance(base_dict[key], dict):
                stack.append((base_dict[key], val))
            else:
                base_dict[key] = val

    return merged


def remap_dictionary_keys(source_data: dict, key_aliases: dict = None) -> dict:
    """
    Transform dictionary keys using a configurable mapping schema.

    Args:
        source_data: Original dictionary to process
        key_aliases: Custom key transformation rules (Optional)
            When provided, overrides default key mapping
            Format: {<original_key>: <new_key>, ...}

    Returns:
        dict: New dictionary with transformed keys preserving original values

    Example:
        >>> input_data = {"old_key": "value", "another_field": 42}
        >>> remap_dictionary_keys(input_data, {"old_key": "new_key"})
        {'new_key': 'value', 'another_field': 42}
    """
    DEFAULT_KEY_MAP = {
        "chunk_num": "chunk_count",
        "doc_num": "document_count",
        "parser_id": "chunk_method",
        "embd_id": "embedding_model",
    }

    transformed_data = {}
    mapping = key_aliases or DEFAULT_KEY_MAP

    for original_key, value in source_data.items():
        mapped_key = mapping.get(original_key, original_key)
        transformed_data[mapped_key] = value

    return transformed_data


def group_by(list_of_dict, key):
    res = {}
    for item in list_of_dict:
        if item[key] in res.keys():
            res[item[key]].append(item)
        else:
            res[item[key]] = [item]
    return res


def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, str]:
    results = {}
    tool_call_sessions = []
    try:
        for mcp_server in mcp_servers:
            server_key = mcp_server.id

            cached_tools = mcp_server.variables.get("tools", {})

            tool_call_session = MCPToolCallSession(mcp_server, mcp_server.variables)
            tool_call_sessions.append(tool_call_session)

            try:
                tools = tool_call_session.get_tools(timeout)
            except Exception:
                tools = []

            results[server_key] = []
            for tool in tools:
                tool_dict = tool.model_dump()
                cached_tool = cached_tools.get(tool_dict["name"], {})

                tool_dict["enabled"] = cached_tool.get("enabled", True)
                results[server_key].append(tool_dict)

        # PERF: blocking call to close sessions; consider moving to a background thread or task queue
        close_multiple_mcp_toolcall_sessions(tool_call_sessions)
        return results, ""
    except Exception as e:
        return {}, str(e)


TimeoutException = Union[Type[BaseException], BaseException]
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]


def timeout(seconds: float | int | str = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
    if isinstance(seconds, str):
        seconds = float(seconds)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            result_queue = queue.Queue(maxsize=1)

            def target():
                try:
                    result = func(*args, **kwargs)
                    result_queue.put(result)
                except Exception as e:
                    result_queue.put(e)

            thread = threading.Thread(target=target)
            thread.daemon = True
            thread.start()

            for a in range(attempts):
                try:
                    if os.environ.get("ENABLE_TIMEOUT_ASSERTION"):
                        result = result_queue.get(timeout=seconds)
                    else:
                        result = result_queue.get()
                    if isinstance(result, Exception):
                        raise result
                    return result
                except queue.Empty:
                    pass
            raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds and {attempts} attempts.")

        @wraps(func)
        async def async_wrapper(*args, **kwargs) -> Any:
            if seconds is None:
                return await func(*args, **kwargs)

            for a in range(attempts):
                try:
                    if os.environ.get("ENABLE_TIMEOUT_ASSERTION"):
                        with trio.fail_after(seconds):
                            return await func(*args, **kwargs)
                    else:
                        return await func(*args, **kwargs)
                except trio.TooSlowError:
                    if a < attempts - 1:
                        continue
                    if on_timeout is not None:
                        if callable(on_timeout):
                            result = on_timeout()
                            if isinstance(result, Coroutine):
                                return await result
                            return result
                        return on_timeout

                    if exception is None:
                        raise TimeoutError(f"Operation timed out after {seconds} seconds and {attempts} attempts.")

                    if isinstance(exception, BaseException):
                        raise exception

                    if isinstance(exception, type) and issubclass(exception, BaseException):
                        raise exception(f"Operation timed out after {seconds} seconds and {attempts} attempts.")

                    raise RuntimeError("Invalid exception type provided")

        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        return wrapper

    return decorator
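
# Illustrative usage (hypothetical functions and budgets, not part of the
# original file): the sync path polls a worker thread, the async path relies
# on trio.fail_after; both retry up to `attempts` times, and the hard limit is
# only enforced when the ENABLE_TIMEOUT_ASSERTION environment variable is set.
@timeout(5, attempts=2)
def _fetch_summary(doc_id):
    ...


@timeout("30", attempts=1, exception=TimeoutError)
async def _embed_chunks(chunks):
    ...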

async def is_strong_enough(chat_model, embedding_model):
    count = settings.STRONG_TEST_COUNT
    if not chat_model or not embedding_model:
        return
    if isinstance(count, int) and count <= 0:
        return

    @timeout(60, 2)
    async def _is_strong_enough():
        nonlocal chat_model, embedding_model
        if embedding_model:
            with trio.fail_after(10):
                _ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
        if chat_model:
            with trio.fail_after(30):
                res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
                if res.find("**ERROR**") >= 0:
                    raise Exception(res)

    # Pressure test for GraphRAG task
    async with trio.open_nursery() as nursery:
        for _ in range(count):
            nursery.start_soon(_is_strong_enough)
api/utils/base64_image.py (new file, +56 lines)
@@ -0,0 +1,56 @@
import base64
import logging
from functools import partial
from io import BytesIO

from PIL import Image

test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAIAAAD/gAIDAAAA6ElEQVR4nO3QwQ3AIBDAsIP9d25XIC+EZE8QZc18w5l9O+AlZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBT+IYAHHLHkdEgAAAABJRU5ErkJggg=="
test_image = base64.b64decode(test_image_base64)


async def image2id(d: dict, storage_put_func: partial, objname: str, bucket: str = "imagetemps"):
    import logging
    from io import BytesIO
    import trio
    from rag.svr.task_executor import minio_limiter
    if not d.get("image"):
        return

    with BytesIO() as output_buffer:
        if isinstance(d["image"], bytes):
            output_buffer.write(d["image"])
            output_buffer.seek(0)
        else:
            # If the image is in RGBA mode, convert it to RGB mode before saving it in JPEG format.
            if d["image"].mode in ("RGBA", "P"):
                converted_image = d["image"].convert("RGB")
                d["image"] = converted_image
            try:
                d["image"].save(output_buffer, format='JPEG')
            except OSError as e:
                logging.warning(
                    "Saving image exception, ignore: {}".format(str(e)))

        async with minio_limiter:
            await trio.to_thread.run_sync(lambda: storage_put_func(bucket=bucket, fnm=objname, binary=output_buffer.getvalue()))
        d["img_id"] = f"{bucket}-{objname}"
        if not isinstance(d["image"], bytes):
            d["image"].close()
        del d["image"]  # Remove image reference


def id2image(image_id: str | None, storage_get_func: partial):
    if not image_id:
        return
    arr = image_id.split("-")
    if len(arr) != 2:
        return
    bkt, nm = image_id.split("-")
    try:
        blob = storage_get_func(bucket=bkt, filename=nm)
        if not blob:
            return
        return Image.open(BytesIO(blob))
    except Exception as e:
        logging.exception(e)
api/utils/commands.py (new file, +78 lines)
@@ -0,0 +1,78 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import base64
import click
import re

from flask import Flask
from werkzeug.security import generate_password_hash

from api.db.services import UserService


@click.command('reset-password', help='Reset the account password.')
@click.option('--email', prompt=True, help='The email address of the account whose password you need to reset')
@click.option('--new-password', prompt=True, help='the new password.')
@click.option('--password-confirm', prompt=True, help='the new password confirm.')
def reset_password(email, new_password, password_confirm):
    if str(new_password).strip() != str(password_confirm).strip():
        click.echo(click.style('sorry. The two passwords do not match.', fg='red'))
        return
    user = UserService.query(email=email)
    if not user:
        click.echo(click.style('sorry. The Email is not registered!.', fg='red'))
        return
    encode_password = base64.b64encode(new_password.encode('utf-8')).decode('utf-8')
    password_hash = generate_password_hash(encode_password)
    user_dict = {
        'password': password_hash
    }
    UserService.update_user(user[0].id, user_dict)
    click.echo(click.style('Congratulations! Password has been reset.', fg='green'))


@click.command('reset-email', help='Reset the account email.')
@click.option('--email', prompt=True, help='The old email address of the account whose email you need to reset')
@click.option('--new-email', prompt=True, help='the new email.')
@click.option('--email-confirm', prompt=True, help='the new email confirm.')
def reset_email(email, new_email, email_confirm):
    if str(new_email).strip() != str(email_confirm).strip():
        click.echo(click.style('Sorry, new email and confirm email do not match.', fg='red'))
        return
    if str(new_email).strip() == str(email).strip():
        click.echo(click.style('Sorry, new email and old email are the same.', fg='red'))
        return
    user = UserService.query(email=email)
    if not user:
        click.echo(click.style('sorry. the account: [{}] not exist .'.format(email), fg='red'))
        return
    if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,4}$", new_email):
        click.echo(click.style('sorry. {} is not a valid email. '.format(new_email), fg='red'))
        return
    new_user = UserService.query(email=new_email)
    if new_user:
        click.echo(click.style('sorry. the account: [{}] is exist .'.format(new_email), fg='red'))
        return
    user_dict = {
        'email': new_email
    }
    UserService.update_user(user[0].id, user_dict)
    click.echo(click.style('Congratulations!, email has been reset.', fg='green'))


def register_commands(app: Flask):
    app.cli.add_command(reset_password)
    app.cli.add_command(reset_email)
api/utils/common.py (new file, +46 lines)
@@ -0,0 +1,46 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

def string_to_bytes(string):
    return string if isinstance(
        string, bytes) else string.encode(encoding="utf-8")


def bytes_to_string(byte):
    return byte.decode(encoding="utf-8")


def convert_bytes(size_in_bytes: int) -> str:
    """
    Format size in bytes.
    """
    if size_in_bytes == 0:
        return "0 B"

    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
    i = 0
    size = float(size_in_bytes)

    while size >= 1024 and i < len(units) - 1:
        size /= 1024
        i += 1

    if i == 0 or size >= 100:
        return f"{size:.0f} {units[i]}"
    elif size >= 10:
        return f"{size:.1f} {units[i]}"
    else:
        return f"{size:.2f} {units[i]}"
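
# Illustrative values, read straight off the thresholds above (hypothetical
# helper, not part of the original file):
def _demo_convert_bytes():
    assert convert_bytes(0) == "0 B"
    assert convert_bytes(1536) == "1.50 KB"
    assert convert_bytes(20 * 1024) == "20.0 KB"
    assert convert_bytes(5 * 1024 ** 3) == "5.00 GB"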
api/utils/configs.py (new file, +179 lines)
@@ -0,0 +1,179 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import io
import copy
import logging
import base64
import pickle
import importlib

from api.utils import file_utils
from filelock import FileLock
from api.utils.common import bytes_to_string, string_to_bytes
from api.constants import SERVICE_CONF


def conf_realpath(conf_name):
    conf_path = f"conf/{conf_name}"
    return os.path.join(file_utils.get_project_base_directory(), conf_path)


def read_config(conf_name=SERVICE_CONF):
    local_config = {}
    local_path = conf_realpath(f'local.{conf_name}')

    # load local config file
    if os.path.exists(local_path):
        local_config = file_utils.load_yaml_conf(local_path)
        if not isinstance(local_config, dict):
            raise ValueError(f'Invalid config file: "{local_path}".')

    global_config_path = conf_realpath(conf_name)
    global_config = file_utils.load_yaml_conf(global_config_path)

    if not isinstance(global_config, dict):
        raise ValueError(f'Invalid config file: "{global_config_path}".')

    global_config.update(local_config)
    return global_config


CONFIGS = read_config()


def show_configs():
    msg = f"Current configs, from {conf_realpath(SERVICE_CONF)}:"
    for k, v in CONFIGS.items():
        if isinstance(v, dict):
            if "password" in v:
                v = copy.deepcopy(v)
                v["password"] = "*" * 8
            if "access_key" in v:
                v = copy.deepcopy(v)
                v["access_key"] = "*" * 8
            if "secret_key" in v:
                v = copy.deepcopy(v)
                v["secret_key"] = "*" * 8
            if "secret" in v:
                v = copy.deepcopy(v)
                v["secret"] = "*" * 8
            if "sas_token" in v:
                v = copy.deepcopy(v)
                v["sas_token"] = "*" * 8
            if "oauth" in k:
                v = copy.deepcopy(v)
                for key, val in v.items():
                    if "client_secret" in val:
                        val["client_secret"] = "*" * 8
            if "authentication" in k:
                v = copy.deepcopy(v)
                for key, val in v.items():
                    if "http_secret_key" in val:
                        val["http_secret_key"] = "*" * 8
        msg += f"\n\t{k}: {v}"
    logging.info(msg)


def get_base_config(key, default=None):
    if key is None:
        return None
    if default is None:
        default = os.environ.get(key.upper())
    return CONFIGS.get(key, default)


def decrypt_database_password(password):
    encrypt_password = get_base_config("encrypt_password", False)
    encrypt_module = get_base_config("encrypt_module", False)
    private_key = get_base_config("private_key", None)

    if not password or not encrypt_password:
        return password

    if not private_key:
        raise ValueError("No private key")

    module_fun = encrypt_module.split("#")
    pwdecrypt_fun = getattr(
        importlib.import_module(
            module_fun[0]),
        module_fun[1])

    return pwdecrypt_fun(private_key, password)


def decrypt_database_config(
        database=None, passwd_key="password", name="database"):
    if not database:
        database = get_base_config(name, {})

    database[passwd_key] = decrypt_database_password(database[passwd_key])
    return database


def update_config(key, value, conf_name=SERVICE_CONF):
    conf_path = conf_realpath(conf_name=conf_name)
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(
            file_utils.get_project_base_directory(), conf_path)

    with FileLock(os.path.join(os.path.dirname(conf_path), ".lock")):
        config = file_utils.load_yaml_conf(conf_path=conf_path) or {}
        config[key] = value
        file_utils.rewrite_yaml_conf(conf_path=conf_path, config=config)


safe_module = {
    'numpy',
    'rag_flow'
}


class RestrictedUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        import importlib
        if module.split('.')[0] in safe_module:
            _module = importlib.import_module(module)
            return getattr(_module, name)
        # Forbid everything else.
        raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
                                     (module, name))


def restricted_loads(src):
    """Helper function analogous to pickle.loads()."""
    return RestrictedUnpickler(io.BytesIO(src)).load()


def serialize_b64(src, to_str=False):
    dest = base64.b64encode(pickle.dumps(src))
    if not to_str:
        return dest
    else:
        return bytes_to_string(dest)


def deserialize_b64(src):
    src = base64.b64decode(
        string_to_bytes(src) if isinstance(
            src, str) else src)
    use_deserialize_safe_module = get_base_config(
        'use_deserialize_safe_module', False)
    if use_deserialize_safe_module:
        return restricted_loads(src)
    return pickle.loads(src)
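
# Illustrative round trip (hypothetical helper, not part of the original file):
# pickle -> base64 text -> original object. deserialize_b64 only routes through
# RestrictedUnpickler when use_deserialize_safe_module is enabled in the config.
def _demo_b64_roundtrip():
    payload = {"ids": [1, 2, 3], "name": "demo"}
    blob = serialize_b64(payload, to_str=True)
    assert deserialize_b64(blob) == payload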
api/utils/crypt.py (new file, +64 lines)
@@ -0,0 +1,64 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import base64
import os
import sys
from Cryptodome.PublicKey import RSA
from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5
from api.utils import file_utils


def crypt(line):
    """
    decrypt(crypt(input_string)) == base64(input_string), which frontend and admin_client use.
    """
    file_path = os.path.join(file_utils.get_project_base_directory(), "conf", "public.pem")
    rsa_key = RSA.importKey(open(file_path).read(), "Welcome")
    cipher = Cipher_pkcs1_v1_5.new(rsa_key)
    password_base64 = base64.b64encode(line.encode('utf-8')).decode("utf-8")
    encrypted_password = cipher.encrypt(password_base64.encode())
    return base64.b64encode(encrypted_password).decode('utf-8')


def decrypt(line):
    file_path = os.path.join(file_utils.get_project_base_directory(), "conf", "private.pem")
    rsa_key = RSA.importKey(open(file_path).read(), "Welcome")
    cipher = Cipher_pkcs1_v1_5.new(rsa_key)
    return cipher.decrypt(base64.b64decode(line), "Fail to decrypt password!").decode('utf-8')


def decrypt2(crypt_text):
    from base64 import b64decode, b16decode
    from Crypto.Cipher import PKCS1_v1_5 as Cipher_PKCS1_v1_5
    from Crypto.PublicKey import RSA
    decode_data = b64decode(crypt_text)
    if len(decode_data) == 127:
        hex_fixed = '00' + decode_data.hex()
        decode_data = b16decode(hex_fixed.upper())

    file_path = os.path.join(file_utils.get_project_base_directory(), "conf", "private.pem")
    pem = open(file_path).read()
    rsa_key = RSA.importKey(pem, "Welcome")
    cipher = Cipher_PKCS1_v1_5.new(rsa_key)
    decrypt_text = cipher.decrypt(decode_data, None)
    return (b64decode(decrypt_text)).decode()


if __name__ == "__main__":
    passwd = crypt(sys.argv[1])
    print(passwd)
    print(decrypt(passwd))
api/utils/file_utils.py (new file, +286 lines)
@@ -0,0 +1,286 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from io import BytesIO

import pdfplumber
from cachetools import LRUCache, cached
from PIL import Image
from ruamel.yaml import YAML

from api.constants import IMG_BASE64_PREFIX
from api.db import FileType

PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
RAG_BASE = os.getenv("RAG_BASE")

LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()


def get_project_base_directory(*args):
    global PROJECT_BASE
    if PROJECT_BASE is None:
        PROJECT_BASE = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                os.pardir,
                os.pardir,
            )
        )

    if args:
        return os.path.join(PROJECT_BASE, *args)
    return PROJECT_BASE


def get_rag_directory(*args):
    global RAG_BASE
    if RAG_BASE is None:
        RAG_BASE = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                os.pardir,
                os.pardir,
                os.pardir,
            )
        )
    if args:
        return os.path.join(RAG_BASE, *args)
    return RAG_BASE


def get_rag_python_directory(*args):
    return get_rag_directory("python", *args)


def get_home_cache_dir():
    dir = os.path.join(os.path.expanduser("~"), ".ragflow")
    try:
        os.mkdir(dir)
    except OSError:
        pass
    return dir


@cached(cache=LRUCache(maxsize=10))
def load_json_conf(conf_path):
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path) as f:
            return json.load(f)
    except BaseException:
        raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))


def dump_json_conf(config_data, conf_path):
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path, "w") as f:
            json.dump(config_data, f, indent=4)
    except BaseException:
        raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))


def load_json_conf_real_time(conf_path):
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path) as f:
            return json.load(f)
    except BaseException:
        raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))


def load_yaml_conf(conf_path):
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(conf_path) as f:
            yaml = YAML(typ="safe", pure=True)
            return yaml.load(f)
    except Exception as e:
        raise EnvironmentError("loading yaml file config from {} failed:".format(conf_path), e)


def rewrite_yaml_conf(conf_path, config):
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(conf_path, "w") as f:
            yaml = YAML(typ="safe")
            yaml.dump(config, f)
    except Exception as e:
        raise EnvironmentError("rewrite yaml file config {} failed:".format(conf_path), e)


def rewrite_json_file(filepath, json_data):
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4, separators=(",", ": "))
    f.close()


def filename_type(filename):
    filename = filename.lower()
    if re.match(r".*\.pdf$", filename):
        return FileType.PDF.value

    if re.match(r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
        return FileType.DOC.value

    if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
        return FileType.AURAL.value

    if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
        return FileType.VISUAL.value

    return FileType.OTHER.value
|
||||
|
||||
|
||||
def thumbnail_img(filename, blob):
|
||||
"""
|
||||
MySQL LongText max length is 65535
|
||||
"""
|
||||
filename = filename.lower()
|
||||
if re.match(r".*\.pdf$", filename):
|
||||
with sys.modules[LOCK_KEY_pdfplumber]:
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
|
||||
buffered = BytesIO()
|
||||
resolution = 32
|
||||
img = None
|
||||
for _ in range(10):
|
||||
# https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
|
||||
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000 and resolution >= 2:
|
||||
resolution = resolution / 2
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
pdf.close()
|
||||
return img
|
||||
|
||||
elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
||||
image = Image.open(BytesIO(blob))
|
||||
image.thumbnail((30, 30))
|
||||
buffered = BytesIO()
|
||||
image.save(buffered, format="png")
|
||||
return buffered.getvalue()
|
||||
|
||||
elif re.match(r".*\.(ppt|pptx)$", filename):
|
||||
import aspose.pydrawing as drawing
|
||||
import aspose.slides as slides
|
||||
|
||||
try:
|
||||
with slides.Presentation(BytesIO(blob)) as presentation:
|
||||
buffered = BytesIO()
|
||||
scale = 0.03
|
||||
img = None
|
||||
for _ in range(10):
|
||||
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
|
||||
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000:
|
||||
scale = scale / 2.0
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
return img
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def thumbnail(filename, blob):
|
||||
img = thumbnail_img(filename, blob)
|
||||
if img is not None:
|
||||
return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
|
||||
else:
|
||||
return ""
def traversal_files(base):
    for root, ds, fs in os.walk(base):
        for f in fs:
            fullname = os.path.join(root, f)
            yield fullname


def repair_pdf_with_ghostscript(input_bytes):
    if shutil.which("gs") is None:
        return input_bytes

    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
        temp_in.write(input_bytes)
        temp_in.flush()

        cmd = [
            "gs",
            "-o",
            temp_out.name,
            "-sDEVICE=pdfwrite",
            "-dPDFSETTINGS=/prepress",
            temp_in.name,
        ]
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True)
            if proc.returncode != 0:
                return input_bytes
        except Exception:
            return input_bytes

        temp_out.seek(0)
        repaired_bytes = temp_out.read()

    return repaired_bytes


def read_potential_broken_pdf(blob):
    def try_open(blob):
        try:
            with pdfplumber.open(BytesIO(blob)) as pdf:
                if pdf.pages:
                    return True
        except Exception:
            return False
        return False

    if try_open(blob):
        return blob

    repaired = repair_pdf_with_ghostscript(blob)
    if try_open(repaired):
        return repaired

    return blob
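
# Illustrative usage sketch, not part of the commit: reading a possibly corrupted PDF and
# falling back to the Ghostscript-repaired bytes when plain parsing fails. The input path
# is hypothetical.
if __name__ == "__main__":
    with open("maybe_broken.pdf", "rb") as f:
        raw = f.read()
    usable = read_potential_broken_pdf(raw)
    print(f"kept {len(usable)} bytes ({'repaired' if usable != raw else 'original'})")
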
104
api/utils/health.py
Normal file
@@ -0,0 +1,104 @@
from timeit import default_timer as timer

from api import settings
from api.db.db_models import DB
from rag.utils.redis_conn import REDIS_CONN
from rag.utils.storage_factory import STORAGE_IMPL


def _ok_nok(ok: bool) -> str:
    return "ok" if ok else "nok"


def check_db() -> tuple[bool, dict]:
    st = timer()
    try:
        # lightweight probe; works for MySQL/Postgres
        DB.execute_sql("SELECT 1")
        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_redis() -> tuple[bool, dict]:
    st = timer()
    try:
        ok = bool(REDIS_CONN.health())
        return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_doc_engine() -> tuple[bool, dict]:
    st = timer()
    try:
        meta = settings.docStoreConn.health()
        # treat any successful call as ok
        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", **(meta or {})}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_storage() -> tuple[bool, dict]:
    st = timer()
    try:
        STORAGE_IMPL.health()
        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_chat() -> tuple[bool, dict]:
    st = timer()
    try:
        cfg = getattr(settings, "CHAT_CFG", None)
        ok = bool(cfg and cfg.get("factory"))
        return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def run_health_checks() -> tuple[dict, bool]:
    result: dict[str, str | dict] = {}

    db_ok, db_meta = check_db()
    chat_ok, chat_meta = check_chat()

    result["db"] = _ok_nok(db_ok)
    if not db_ok:
        result.setdefault("_meta", {})["db"] = db_meta

    result["chat"] = _ok_nok(chat_ok)
    if not chat_ok:
        result.setdefault("_meta", {})["chat"] = chat_meta

    # Optional probes (do not change minimal contract but exposed for observability)
    try:
        redis_ok, redis_meta = check_redis()
        result["redis"] = _ok_nok(redis_ok)
        if not redis_ok:
            result.setdefault("_meta", {})["redis"] = redis_meta
    except Exception:
        result["redis"] = "nok"

    try:
        doc_ok, doc_meta = check_doc_engine()
        result["doc_engine"] = _ok_nok(doc_ok)
        if not doc_ok:
            result.setdefault("_meta", {})["doc_engine"] = doc_meta
    except Exception:
        result["doc_engine"] = "nok"

    try:
        sto_ok, sto_meta = check_storage()
        result["storage"] = _ok_nok(sto_ok)
        if not sto_ok:
            result.setdefault("_meta", {})["storage"] = sto_meta
    except Exception:
        result["storage"] = "nok"

    all_ok = (result.get("db") == "ok") and (result.get("chat") == "ok")
    result["status"] = "ok" if all_ok else "nok"
    return result, all_ok
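
# Illustrative usage sketch, not part of the commit: exposing run_health_checks() through a
# FastAPI route, in line with the Flask-to-FastAPI direction of this change. The route path
# and app object are assumptions, not code from this diff.
from fastapi import FastAPI
from fastapi.responses import JSONResponse

from api.utils.health import run_health_checks

app = FastAPI()

@app.get("/v1/system/healthz")
def healthz() -> JSONResponse:
    result, all_ok = run_health_checks()
    # 200 only when the minimal contract (db + chat) is satisfied; 503 otherwise.
    return JSONResponse(content=result, status_code=200 if all_ok else 503)
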
200
api/utils/health_utils.py
Normal file
@@ -0,0 +1,200 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import requests
from timeit import default_timer as timer

from api import settings
from api.db.db_models import DB
from rag import settings as rag_settings
from rag.utils.redis_conn import REDIS_CONN
from rag.utils.storage_factory import STORAGE_IMPL
from rag.utils.es_conn import ESConnection
from rag.utils.infinity_conn import InfinityConnection


def _ok_nok(ok: bool) -> str:
    return "ok" if ok else "nok"


def check_db() -> tuple[bool, dict]:
    st = timer()
    try:
        # lightweight probe; works for MySQL/Postgres
        DB.execute_sql("SELECT 1")
        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_redis() -> tuple[bool, dict]:
    st = timer()
    try:
        ok = bool(REDIS_CONN.health())
        return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_doc_engine() -> tuple[bool, dict]:
    st = timer()
    try:
        meta = settings.docStoreConn.health()
        # treat any successful call as ok
        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", **(meta or {})}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def check_storage() -> tuple[bool, dict]:
    st = timer()
    try:
        STORAGE_IMPL.health()
        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
    except Exception as e:
        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}


def get_es_cluster_stats() -> dict:
    doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
    if doc_engine != 'elasticsearch':
        raise Exception("Elasticsearch is not in use.")
    try:
        return {
            "alive": True,
            "message": ESConnection().get_cluster_stats()
        }
    except Exception as e:
        return {
            "alive": False,
            "message": f"error: {str(e)}",
        }


def get_infinity_status():
    doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
    if doc_engine != 'infinity':
        raise Exception("Infinity is not in use.")
    try:
        return {
            "alive": True,
            "message": InfinityConnection().health()
        }
    except Exception as e:
        return {
            "alive": False,
            "message": f"error: {str(e)}",
        }


def get_mysql_status():
    try:
        cursor = DB.execute_sql("SHOW PROCESSLIST;")
        res_rows = cursor.fetchall()
        headers = ['id', 'user', 'host', 'db', 'command', 'time', 'state', 'info']
        cursor.close()
        return {
            "alive": True,
            "message": [dict(zip(headers, r)) for r in res_rows]
        }
    except Exception as e:
        return {
            "alive": False,
            "message": f"error: {str(e)}",
        }


def check_minio_alive():
    start_time = timer()
    try:
        response = requests.get(f'http://{rag_settings.MINIO["host"]}/minio/health/live')
        if response.status_code == 200:
            return {'alive': True, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
        else:
            return {'alive': False, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
    except Exception as e:
        return {
            "alive": False,
            "message": f"error: {str(e)}",
        }


def get_redis_info():
    try:
        return {
            "alive": True,
            "message": REDIS_CONN.info()
        }
    except Exception as e:
        return {
            "alive": False,
            "message": f"error: {str(e)}",
        }


def check_ragflow_server_alive():
    start_time = timer()
    try:
        response = requests.get(f'http://{settings.HOST_IP}:{settings.HOST_PORT}/v1/system/ping')
        if response.status_code == 200:
            return {'alive': True, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
        else:
            return {'alive': False, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
    except Exception as e:
        return {
            "alive": False,
            "message": f"error: {str(e)}",
        }


def run_health_checks() -> tuple[dict, bool]:
    result: dict[str, str | dict] = {}

    db_ok, db_meta = check_db()
    result["db"] = _ok_nok(db_ok)
    if not db_ok:
        result.setdefault("_meta", {})["db"] = db_meta

    try:
        redis_ok, redis_meta = check_redis()
        result["redis"] = _ok_nok(redis_ok)
        if not redis_ok:
            result.setdefault("_meta", {})["redis"] = redis_meta
    except Exception:
        result["redis"] = "nok"

    try:
        doc_ok, doc_meta = check_doc_engine()
        result["doc_engine"] = _ok_nok(doc_ok)
        if not doc_ok:
            result.setdefault("_meta", {})["doc_engine"] = doc_meta
    except Exception:
        result["doc_engine"] = "nok"

    try:
        sto_ok, sto_meta = check_storage()
        result["storage"] = _ok_nok(sto_ok)
        if not sto_ok:
            result.setdefault("_meta", {})["storage"] = sto_meta
    except Exception:
        result["storage"] = "nok"

    all_ok = (result.get("db") == "ok") and (result.get("redis") == "ok") and (result.get("doc_engine") == "ok") and (result.get("storage") == "ok")
    result["status"] = "ok" if all_ok else "nok"
    return result, all_ok
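
# Illustrative usage sketch, not part of the commit: aggregating the per-component probes above
# into a single report for an admin/status view. The function name and result shape are assumptions.
from api.utils.health_utils import get_mysql_status, get_redis_info, check_minio_alive

def collect_component_status() -> dict:
    # Each helper already swallows its own exceptions and returns {"alive": ..., "message": ...}.
    return {
        "mysql": get_mysql_status(),
        "redis": get_redis_info(),
        "minio": check_minio_alive(),
    }
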
78
api/utils/json.py
Normal file
@@ -0,0 +1,78 @@
import datetime
import json
from enum import Enum, IntEnum
from api.utils.common import string_to_bytes, bytes_to_string


class BaseType:
    def to_dict(self):
        return dict([(k.lstrip("_"), v) for k, v in self.__dict__.items()])

    def to_dict_with_type(self):
        def _dict(obj):
            module = None
            if issubclass(obj.__class__, BaseType):
                data = {}
                for attr, v in obj.__dict__.items():
                    k = attr.lstrip("_")
                    data[k] = _dict(v)
                module = obj.__module__
            elif isinstance(obj, (list, tuple)):
                data = []
                for i, vv in enumerate(obj):
                    data.append(_dict(vv))
            elif isinstance(obj, dict):
                data = {}
                for _k, vv in obj.items():
                    data[_k] = _dict(vv)
            else:
                data = obj
            return {"type": obj.__class__.__name__,
                    "data": data, "module": module}

        return _dict(self)


class CustomJSONEncoder(json.JSONEncoder):
    def __init__(self, **kwargs):
        self._with_type = kwargs.pop("with_type", False)
        super().__init__(**kwargs)

    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        elif isinstance(obj, datetime.date):
            return obj.strftime('%Y-%m-%d')
        elif isinstance(obj, datetime.timedelta):
            return str(obj)
        elif issubclass(type(obj), Enum) or issubclass(type(obj), IntEnum):
            return obj.value
        elif isinstance(obj, set):
            return list(obj)
        elif issubclass(type(obj), BaseType):
            if not self._with_type:
                return obj.to_dict()
            else:
                return obj.to_dict_with_type()
        elif isinstance(obj, type):
            return obj.__name__
        else:
            return json.JSONEncoder.default(self, obj)


def json_dumps(src, byte=False, indent=None, with_type=False):
    dest = json.dumps(
        src,
        indent=indent,
        cls=CustomJSONEncoder,
        with_type=with_type)
    if byte:
        dest = string_to_bytes(dest)
    return dest


def json_loads(src, object_hook=None, object_pairs_hook=None):
    if isinstance(src, bytes):
        src = bytes_to_string(src)
    return json.loads(src, object_hook=object_hook,
                      object_pairs_hook=object_pairs_hook)
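
# Illustrative usage sketch, not part of the commit: CustomJSONEncoder serializes values the
# stock encoder rejects (datetimes, sets, Enums), and json_loads accepts bytes transparently.
# The payload values are made up.
import datetime

from api.utils.json import json_dumps, json_loads

payload = {"created_at": datetime.datetime(2024, 1, 1, 12, 0, 0), "tags": {"a", "b"}}
raw = json_dumps(payload, byte=True)   # b'{"created_at": "2024-01-01 12:00:00", "tags": [...]}'
restored = json_loads(raw)             # plain str/list values come back, not datetime/set
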
91
api/utils/log_utils.py
Normal file
@@ -0,0 +1,91 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import os.path
import logging
from logging.handlers import RotatingFileHandler

initialized_root_logger = False


def get_project_base_directory():
    PROJECT_BASE = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            os.pardir,
            os.pardir,
        )
    )
    return PROJECT_BASE


def init_root_logger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
    global initialized_root_logger
    if initialized_root_logger:
        return
    initialized_root_logger = True

    logger = logging.getLogger()
    logger.handlers.clear()
    log_path = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"{logfile_basename}.log"))

    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    formatter = logging.Formatter(log_format)

    handler1 = RotatingFileHandler(log_path, maxBytes=10*1024*1024, backupCount=5)
    handler1.setFormatter(formatter)
    logger.addHandler(handler1)

    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    logger.addHandler(handler2)

    logging.captureWarnings(True)

    LOG_LEVELS = os.environ.get("LOG_LEVELS", "")
    pkg_levels = {}
    for pkg_name_level in LOG_LEVELS.split(","):
        terms = pkg_name_level.split("=")
        if len(terms) != 2:
            continue
        pkg_name, pkg_level = terms[0], terms[1]
        pkg_name = pkg_name.strip()
        pkg_level = logging.getLevelName(pkg_level.strip().upper())
        if not isinstance(pkg_level, int):
            pkg_level = logging.INFO
        pkg_levels[pkg_name] = logging.getLevelName(pkg_level)

    for pkg_name in ['peewee', 'pdfminer']:
        if pkg_name not in pkg_levels:
            pkg_levels[pkg_name] = logging.getLevelName(logging.WARNING)
    if 'root' not in pkg_levels:
        pkg_levels['root'] = logging.getLevelName(logging.INFO)

    for pkg_name, pkg_level in pkg_levels.items():
        pkg_logger = logging.getLogger(pkg_name)
        pkg_logger.setLevel(pkg_level)

    msg = f"{logfile_basename} log path: {log_path}, log levels: {pkg_levels}"
    logger.info(msg)


def log_exception(e, *args):
    logging.exception(e)
    for a in args:
        if hasattr(a, "text"):
            logging.error(a.text)
            raise Exception(a.text)
        else:
            logging.error(str(a))
    raise e
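
# Illustrative usage sketch, not part of the commit: initializing the root logger for a process
# and tuning per-package levels via the LOG_LEVELS environment variable. The basename is hypothetical.
import os

os.environ.setdefault("LOG_LEVELS", "peewee=WARNING,root=INFO")

from api.utils.log_utils import init_root_logger

init_root_logger("ragflow_server")   # writes to <project>/logs/ragflow_server.log and to stderr
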
636
api/utils/validation_utils.py
Normal file
@@ -0,0 +1,636 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import Counter
from typing import Annotated, Any, Literal
from uuid import UUID

from flask import Request
from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    StringConstraints,
    ValidationError,
    field_validator,
)
from pydantic_core import PydanticCustomError
from werkzeug.exceptions import BadRequest, UnsupportedMediaType

from api.constants import DATASET_NAME_LIMIT


def validate_and_parse_json_request(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None, exclude_unset: bool = False) -> tuple[dict[str, Any] | None, str | None]:
    """
    Validates and parses JSON requests through a multi-stage validation pipeline.

    Implements a four-stage validation process:
    1. Content-Type verification (must be application/json)
    2. JSON syntax validation
    3. Payload structure type checking
    4. Pydantic model validation with error formatting

    Args:
        request (Request): Flask request object containing HTTP payload
        validator (type[BaseModel]): Pydantic model class for data validation
        extras (dict[str, Any] | None): Additional fields to merge into payload
            before validation. These fields will be removed from the final output
        exclude_unset (bool): Whether to exclude fields that have not been explicitly set

    Returns:
        tuple[dict[str, Any] | None, str | None]:
            - First element:
                - Validated dictionary on success
                - None on validation failure
            - Second element:
                - None on success
                - Diagnostic error message on failure

    Raises:
        UnsupportedMediaType: When Content-Type header is not application/json
        BadRequest: For structural JSON syntax errors
        ValidationError: When payload violates Pydantic schema rules

    Examples:
        >>> validate_and_parse_json_request(valid_request, DatasetSchema)
        ({"name": "Dataset1", "format": "csv"}, None)

        >>> validate_and_parse_json_request(xml_request, DatasetSchema)
        (None, "Unsupported content type: Expected application/json, got text/xml")

        >>> validate_and_parse_json_request(bad_json_request, DatasetSchema)
        (None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")

    Notes:
        1. Validation priority:
            - Content-Type verification precedes JSON parsing
            - Structural validation occurs before schema validation
        2. Extra fields added via the `extras` parameter are automatically removed
           from the final output after validation
    """
    try:
        payload = request.get_json() or {}
    except UnsupportedMediaType:
        return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
    except BadRequest:
        return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"

    if not isinstance(payload, dict):
        return None, f"Invalid request payload: expected object, got {type(payload).__name__}"

    try:
        if extras is not None:
            payload.update(extras)
        validated_request = validator(**payload)
    except ValidationError as e:
        return None, format_validation_error_message(e)

    parsed_payload = validated_request.model_dump(by_alias=True, exclude_unset=exclude_unset)

    if extras is not None:
        for key in list(parsed_payload.keys()):
            if key in extras:
                del parsed_payload[key]

    return parsed_payload, None


def validate_and_parse_request_args(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None) -> tuple[dict[str, Any] | None, str | None]:
    """
    Validates and parses request arguments against a Pydantic model.

    This function performs a complete request validation workflow:
    1. Extracts query parameters from the request
    2. Merges with optional extra values (if provided)
    3. Validates against the specified Pydantic model
    4. Cleans the output by removing extra values
    5. Returns either parsed data or an error message

    Args:
        request (Request): Web framework request object containing query parameters
        validator (type[BaseModel]): Pydantic model class for validation
        extras (dict[str, Any] | None): Optional additional values to include in validation
            but exclude from final output. Defaults to None.

    Returns:
        tuple[dict[str, Any] | None, str | None]:
            - First element: Validated/parsed arguments as dict if successful, None otherwise
            - Second element: Formatted error message if validation failed, None otherwise

    Behavior:
        - Query parameters are merged with extras before validation
        - Extras are automatically removed from the final output
        - All validation errors are formatted into a human-readable string

    Raises:
        TypeError: If validator is not a Pydantic BaseModel subclass

    Examples:
        Successful validation:
        >>> validate_and_parse_request_args(request, MyValidator)
        ({'param1': 'value'}, None)

        Failed validation:
        >>> validate_and_parse_request_args(request, MyValidator)
        (None, "param1: Field required")

        With extras:
        >>> validate_and_parse_request_args(request, MyValidator, extras={'internal_id': 123})
        ({'param1': 'value'}, None)  # internal_id removed from output

    Notes:
        - Uses request.args.to_dict() for Flask-compatible parameter extraction
        - Maintains immutability of original request arguments
        - Preserves type conversion from Pydantic validation
    """
    args = request.args.to_dict(flat=True)
    try:
        if extras is not None:
            args.update(extras)
        validated_args = validator(**args)
    except ValidationError as e:
        return None, format_validation_error_message(e)

    parsed_args = validated_args.model_dump()
    if extras is not None:
        for key in list(parsed_args.keys()):
            if key in extras:
                del parsed_args[key]

    return parsed_args, None


def format_validation_error_message(e: ValidationError) -> str:
    """
    Formats validation errors into a standardized string format.

    Processes pydantic ValidationError objects to create human-readable error messages
    containing field locations, error descriptions, and input values.

    Args:
        e (ValidationError): The validation error instance containing error details

    Returns:
        str: Formatted error messages joined by newlines. Each line contains:
            - Field path (dot-separated)
            - Error message
            - Truncated input value (max 128 chars)

    Example:
        >>> try:
        ...     UserModel(name=123, email="invalid")
        ... except ValidationError as e:
        ...     print(format_validation_error_message(e))
        Field: <name> - Message: <Input should be a valid string> - Value: <123>
        Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
    """
    error_messages = []

    for error in e.errors():
        field = ".".join(map(str, error["loc"]))
        msg = error["msg"]
        input_val = error["input"]
        input_str = str(input_val)

        if len(input_str) > 128:
            input_str = input_str[:125] + "..."

        error_msg = f"Field: <{field}> - Message: <{msg}> - Value: <{input_str}>"
        error_messages.append(error_msg)

    return "\n".join(error_messages)


def normalize_str(v: Any) -> Any:
    """
    Normalizes string values to a standard format while preserving non-string inputs.

    Performs the following transformations when input is a string:
    1. Trims leading/trailing whitespace (str.strip())
    2. Converts to lowercase (str.lower())

    Non-string inputs are returned unchanged, making this function safe for mixed-type
    processing pipelines.

    Args:
        v (Any): Input value to normalize. Accepts any Python object.

    Returns:
        Any: Normalized string if input was string-type, original value otherwise.

    Behavior Examples:
        String Input: "  Admin  " → "admin"
        Empty String: "   " → "" (empty string)
        Non-String:
            - 123 → 123
            - None → None
            - ["User"] → ["User"]

    Typical Use Cases:
        - Standardizing user input
        - Preparing data for case-insensitive comparison
        - Cleaning API parameters
        - Normalizing configuration values

    Edge Cases:
        - Unicode whitespace is handled by str.strip()
        - Locale-independent lowercasing (str.lower())
        - Preserves falsy values (0, False, etc.)

    Example:
        >>> normalize_str("  ReadOnly  ")
        'readonly'
        >>> normalize_str(42)
        42
    """
    if isinstance(v, str):
        stripped = v.strip()
        normalized = stripped.lower()
        return normalized
    return v


def validate_uuid1_hex(v: Any) -> str:
    """
    Validates and converts input to a UUID version 1 hexadecimal string.

    This function performs strict validation and normalization:
    1. Accepts either UUID objects or UUID-formatted strings
    2. Verifies the UUID is version 1 (time-based)
    3. Returns the 32-character hexadecimal representation

    Args:
        v (Any): Input value to validate. Can be:
            - UUID object (must be version 1)
            - String in UUID format (e.g. "550e8400-e29b-41d4-a716-446655440000")

    Returns:
        str: 32-character lowercase hexadecimal string without hyphens
            Example: "550e8400e29b41d4a716446655440000"

    Raises:
        PydanticCustomError: With code "invalid_UUID1_format" when:
            - Input is not a UUID object or valid UUID string
            - UUID version is not 1
            - String doesn't match UUID format

    Examples:
        Valid cases:
        >>> validate_uuid1_hex("550e8400-e29b-41d4-a716-446655440000")
        '550e8400e29b41d4a716446655440000'
        >>> validate_uuid1_hex(UUID('550e8400-e29b-41d4-a716-446655440000'))
        '550e8400e29b41d4a716446655440000'

        Invalid cases:
        >>> validate_uuid1_hex("not-a-uuid")  # raises PydanticCustomError
        >>> validate_uuid1_hex(12345)  # raises PydanticCustomError
        >>> validate_uuid1_hex(UUID(int=0))  # not a UUID1, raises PydanticCustomError

    Notes:
        - Uses Python's built-in UUID parser for format validation
        - Version check prevents accidental use of other UUID versions
        - Hyphens in input strings are automatically removed in output
    """
    try:
        uuid_obj = UUID(v) if isinstance(v, str) else v
        if uuid_obj.version != 1:
            raise PydanticCustomError("invalid_UUID1_format", "Must be a UUID1 format")
        return uuid_obj.hex
    except (AttributeError, ValueError, TypeError):
        raise PydanticCustomError("invalid_UUID1_format", "Invalid UUID1 format")


class Base(BaseModel):
    model_config = ConfigDict(extra="forbid", strict=True)


class RaptorConfig(Base):
    use_raptor: Annotated[bool, Field(default=False)]
    prompt: Annotated[
        str,
        StringConstraints(strip_whitespace=True, min_length=1),
        Field(
            default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize."
        ),
    ]
    max_token: Annotated[int, Field(default=256, ge=1, le=2048)]
    threshold: Annotated[float, Field(default=0.1, ge=0.0, le=1.0)]
    max_cluster: Annotated[int, Field(default=64, ge=1, le=1024)]
    random_seed: Annotated[int, Field(default=0, ge=0)]


class GraphragConfig(Base):
    use_graphrag: Annotated[bool, Field(default=False)]
    entity_types: Annotated[list[str], Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])]
    method: Annotated[Literal["light", "general"], Field(default="light")]
    community: Annotated[bool, Field(default=False)]
    resolution: Annotated[bool, Field(default=False)]


class ParserConfig(Base):
    auto_keywords: Annotated[int, Field(default=0, ge=0, le=32)]
    auto_questions: Annotated[int, Field(default=0, ge=0, le=10)]
    chunk_token_num: Annotated[int, Field(default=512, ge=1, le=2048)]
    delimiter: Annotated[str, Field(default=r"\n", min_length=1)]
    graphrag: Annotated[GraphragConfig, Field(default_factory=lambda: GraphragConfig(use_graphrag=False))]
    html4excel: Annotated[bool, Field(default=False)]
    layout_recognize: Annotated[str, Field(default="DeepDOC")]
    raptor: Annotated[RaptorConfig, Field(default_factory=lambda: RaptorConfig(use_raptor=False))]
    tag_kb_ids: Annotated[list[str], Field(default_factory=list)]
    topn_tags: Annotated[int, Field(default=1, ge=1, le=10)]
    filename_embd_weight: Annotated[float | None, Field(default=0.1, ge=0.0, le=1.0)]
    task_page_size: Annotated[int | None, Field(default=None, ge=1)]
    pages: Annotated[list[list[int]] | None, Field(default=None)]


class CreateDatasetReq(Base):
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
    avatar: Annotated[str | None, Field(default=None, max_length=65535)]
    description: Annotated[str | None, Field(default=None, max_length=65535)]
    embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")]
    permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)]
    chunk_method: Annotated[
        Literal["naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"],
        Field(default="naive", min_length=1, max_length=32, serialization_alias="parser_id"),
    ]
    parser_config: Annotated[ParserConfig | None, Field(default=None)]

    @field_validator("avatar", mode="after")
    @classmethod
    def validate_avatar_base64(cls, v: str | None) -> str | None:
        """
        Validates Base64-encoded avatar string format and MIME type compliance.

        Implements a three-stage validation workflow:
        1. MIME prefix existence check
        2. MIME type format validation
        3. Supported type verification

        Args:
            v (str): Raw avatar field value

        Returns:
            str: Validated Base64 string

        Raises:
            PydanticCustomError: For structural errors in these cases:
                - Missing MIME prefix header
                - Invalid MIME prefix format
                - Unsupported image MIME type

        Example:
            ```python
            # Valid case
            CreateDatasetReq(avatar="data:image/png;base64,iVBORw0KGg...")

            # Invalid cases
            CreateDatasetReq(avatar="image/jpeg;base64,...")  # Missing 'data:' prefix
            CreateDatasetReq(avatar="data:video/mp4;base64,...")  # Unsupported MIME type
            ```
        """
        if v is None:
            return v

        if "," in v:
            prefix, _ = v.split(",", 1)
            if not prefix.startswith("data:"):
                raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")

            mime_type = prefix[5:].split(";")[0]
            supported_mime_types = ["image/jpeg", "image/png"]
            if mime_type not in supported_mime_types:
                raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})

            return v
        else:
            raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")

    @field_validator("embedding_model", mode="before")
    @classmethod
    def normalize_embedding_model(cls, v: Any) -> Any:
        """Normalize embedding model string by stripping whitespace"""
        if isinstance(v, str):
            return v.strip()
        return v

    @field_validator("embedding_model", mode="after")
    @classmethod
    def validate_embedding_model(cls, v: str | None) -> str | None:
        """
        Validates embedding model identifier format compliance.

        Validation pipeline:
        1. Structural format verification
        2. Component non-empty check
        3. Value normalization

        Args:
            v (str): Raw model identifier

        Returns:
            str: Validated <model_name>@<provider> format

        Raises:
            PydanticCustomError: For these violations:
                - Missing @ separator
                - Empty model_name/provider
                - Invalid component structure

        Examples:
            Valid: "text-embedding-3-large@openai"
            Invalid: "invalid_model" (no @)
            Invalid: "@openai" (empty model_name)
            Invalid: "text-embedding-3-large@" (empty provider)
        """
        if isinstance(v, str):
            if "@" not in v:
                raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")

            components = v.split("@", 1)
            if len(components) != 2 or not all(components):
                raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")

            model_name, provider = components
            if not model_name.strip() or not provider.strip():
                raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
        return v

    # @field_validator("permission", mode="before")
    # @classmethod
    # def normalize_permission(cls, v: Any) -> Any:
    #     return normalize_str(v)

    @field_validator("parser_config", mode="before")
    @classmethod
    def normalize_empty_parser_config(cls, v: Any) -> Any:
        """
        Normalizes empty parser configuration by converting empty dictionaries to None.

        This validator ensures consistent handling of empty parser configurations across
        the application by converting empty dicts to None values.

        Args:
            v (Any): Raw input value for the parser config field

        Returns:
            Any: Returns None if input is an empty dict, otherwise returns the original value

        Example:
            >>> normalize_empty_parser_config({})
            None

            >>> normalize_empty_parser_config({"key": "value"})
            {"key": "value"}
        """
        if v == {}:
            return None
        return v

    @field_validator("parser_config", mode="after")
    @classmethod
    def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
        """
        Validates serialized JSON length constraints for parser configuration.

        Implements a multi-stage validation workflow:
        1. Null check - bypass validation for empty configurations
        2. Model serialization - convert Pydantic model to JSON string
        3. Size verification - enforce maximum allowed payload size

        Args:
            v (ParserConfig | None): Raw parser configuration object

        Returns:
            ParserConfig | None: Validated configuration object

        Raises:
            PydanticCustomError: When serialized JSON exceeds 65,535 characters
        """
        if v is None:
            return None

        if (json_str := v.model_dump_json()) and len(json_str) > 65535:
            raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
        return v


class UpdateDatasetReq(CreateDatasetReq):
    dataset_id: Annotated[str, Field(...)]
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]
    pagerank: Annotated[int, Field(default=0, ge=0, le=100)]

    @field_validator("dataset_id", mode="before")
    @classmethod
    def validate_dataset_id(cls, v: Any) -> str:
        return validate_uuid1_hex(v)


class DeleteReq(Base):
    ids: Annotated[list[str] | None, Field(...)]

    @field_validator("ids", mode="after")
    @classmethod
    def validate_ids(cls, v_list: list[str] | None) -> list[str] | None:
        """
        Validates and normalizes a list of UUID strings with None handling.

        This post-processing validator performs:
        1. None input handling (pass-through)
        2. UUID version 1 validation for each list item
        3. Duplicate value detection
        4. Returns normalized UUID hex strings or None

        Args:
            v_list (list[str] | None): Input list that has passed initial validation.
                Either a list of UUID strings or None.

        Returns:
            list[str] | None:
                - None if input was None
                - List of normalized UUID hex strings otherwise:
                    * 32-character lowercase
                    * Valid UUID version 1
                    * Unique within list

        Raises:
            PydanticCustomError: With structured error details when:
                - "invalid_UUID1_format": Any string fails UUIDv1 validation
                - "duplicate_uuids": If duplicate IDs are detected

        Validation Rules:
            - None input returns None
            - Empty list returns empty list
            - All non-None items must be valid UUIDv1
            - No duplicates permitted
            - Original order preserved

        Examples:
            Valid cases:
            >>> validate_ids(None)
            None
            >>> validate_ids([])
            []
            >>> validate_ids(["550e8400-e29b-41d4-a716-446655440000"])
            ["550e8400e29b41d4a716446655440000"]

            Invalid cases:
            >>> validate_ids(["invalid"])
            # raises PydanticCustomError(invalid_UUID1_format)
            >>> validate_ids(["550e...", "550e..."])
            # raises PydanticCustomError(duplicate_uuids)

        Security Notes:
            - Validates UUID version to prevent version spoofing
            - Duplicate check prevents data injection
            - None handling maintains pipeline integrity
        """
        if v_list is None:
            return None

        ids_list = []
        for v in v_list:
            try:
                ids_list.append(validate_uuid1_hex(v))
            except PydanticCustomError as e:
                raise e

        duplicates = [item for item, count in Counter(ids_list).items() if count > 1]
        if duplicates:
            duplicates_str = ", ".join(duplicates)
            raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})

        return ids_list


class DeleteDatasetReq(DeleteReq): ...


class BaseListReq(BaseModel):
    model_config = ConfigDict(extra="forbid")

    id: Annotated[str | None, Field(default=None)]
    name: Annotated[str | None, Field(default=None)]
    page: Annotated[int, Field(default=1, ge=1)]
    page_size: Annotated[int, Field(default=30, ge=1)]
    orderby: Annotated[Literal["create_time", "update_time"], Field(default="create_time")]
    desc: Annotated[bool, Field(default=True)]

    @field_validator("id", mode="before")
    @classmethod
    def validate_id(cls, v: Any) -> str:
        return validate_uuid1_hex(v)


class ListDatasetReq(BaseListReq): ...
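
# Illustrative usage sketch, not part of the commit: validating a dataset-creation payload
# directly with the Pydantic model above. The payload values are made up.
from pydantic import ValidationError

from api.utils.validation_utils import CreateDatasetReq, format_validation_error_message

try:
    req = CreateDatasetReq(
        name="  Product Docs  ",
        embedding_model="text-embedding-3-large@OpenAI",
        permission="me",
        chunk_method="naive",
    )
    print(req.model_dump(by_alias=True))   # name stripped; embd_id/parser_id aliases applied
except ValidationError as e:
    print(format_validation_error_message(e))
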
201
api/utils/web_utils.py
Normal file
@@ -0,0 +1,201 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import base64
import ipaddress
import json
import re
import socket
from urllib.parse import urlparse

from api.apps import smtp_mail_server
from flask_mail import Message
from flask import render_template_string
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


CONTENT_TYPE_MAP = {
    # Office
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "doc": "application/msword",
    "pdf": "application/pdf",
    "csv": "text/csv",
    "xls": "application/vnd.ms-excel",
    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    # Text/code
    "txt": "text/plain",
    "py": "text/plain",
    "js": "text/plain",
    "java": "text/plain",
    "c": "text/plain",
    "cpp": "text/plain",
    "h": "text/plain",
    "php": "text/plain",
    "go": "text/plain",
    "ts": "text/plain",
    "sh": "text/plain",
    "cs": "text/plain",
    "kt": "text/plain",
    "sql": "text/plain",
    # Web
    "md": "text/markdown",
    "markdown": "text/markdown",
    "htm": "text/html",
    "html": "text/html",
    "json": "application/json",
    # Image formats
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "bmp": "image/bmp",
    "tiff": "image/tiff",
    "tif": "image/tiff",
    "webp": "image/webp",
    "svg": "image/svg+xml",
    "ico": "image/x-icon",
    "avif": "image/avif",
    "heic": "image/heic",
}


def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: dict = {},
):
    result = __get_pdf_from_html(source, timeout, install_driver, print_options)
    return result


def __send_devtools(driver, cmd, params={}):
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": params})
    response = driver.command_executor._request("POST", url, body)

    if not response:
        raise Exception(response.get("value"))

    return response.get("value")


def __get_pdf_from_html(path: str, timeout: int, install_driver: bool, print_options: dict):
    webdriver_options = Options()
    webdriver_prefs = {}
    webdriver_options.add_argument("--headless")
    webdriver_options.add_argument("--disable-gpu")
    webdriver_options.add_argument("--no-sandbox")
    webdriver_options.add_argument("--disable-dev-shm-usage")
    webdriver_options.experimental_options["prefs"] = webdriver_prefs

    webdriver_prefs["profile.default_content_settings"] = {"images": 2}

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    driver.get(path)

    try:
        WebDriverWait(driver, timeout).until(staleness_of(driver.find_element(by=By.TAG_NAME, value="html")))
    except TimeoutException:
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options)
        result = __send_devtools(driver, "Page.printToPDF", calculated_print_options)
        driver.quit()
        return base64.b64decode(result["data"])


def is_private_ip(ip: str) -> bool:
    try:
        ip_obj = ipaddress.ip_address(ip)
        return ip_obj.is_private
    except ValueError:
        return False


def is_valid_url(url: str) -> bool:
    if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url):
        return False
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname

    if not hostname:
        return False
    try:
        ip = socket.gethostbyname(hostname)
        if is_private_ip(ip):
            return False
    except socket.gaierror:
        return False
    return True


def safe_json_parse(data: str | dict) -> dict:
    if isinstance(data, dict):
        return data
    try:
        return json.loads(data) if data else {}
    except (json.JSONDecodeError, TypeError):
        return {}


def get_float(req: dict, key: str, default: float | int = 10.0) -> float:
    try:
        parsed = float(req.get(key, default))
        return parsed if parsed > 0 else default
    except (TypeError, ValueError):
        return default


INVITE_EMAIL_TMPL = """
<p>Hi {{email}},</p>
<p>{{inviter}} has invited you to join their team (ID: {{tenant_id}}).</p>
<p>Click the link below to complete your registration:<br>
<a href="{{invite_url}}">{{invite_url}}</a></p>
<p>If you did not request this, please ignore this email.</p>
"""

def send_invite_email(to_email, invite_url, tenant_id, inviter):
    from api.apps import app
    with app.app_context():
        msg = Message(subject="RAGFlow Invitation",
                      recipients=[to_email])
        msg.html = render_template_string(
            INVITE_EMAIL_TMPL,
            email=to_email,
            invite_url=invite_url,
            tenant_id=tenant_id,
            inviter=inviter,
        )
        smtp_mail_server.send(msg)
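
# Illustrative usage sketch, not part of the commit: guarding an outbound fetch target with
# is_valid_url (which also rejects hosts resolving to private addresses) and reading a timeout
# from a request dict with get_float. The request values are made up.
from api.utils.web_utils import is_valid_url, get_float

req = {"url": "https://example.com/docs", "timeout": "15"}
if is_valid_url(req["url"]):
    timeout = get_float(req, "timeout", default=10.0)   # falls back to 10.0 on bad input
    print(f"fetching {req['url']} with timeout={timeout}s")
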