Change Flask to FastAPI

2025-10-13 13:18:03 +08:00
commit 88db2539b0
476 changed files with 739741 additions and 0 deletions

api/utils/__init__.py Normal file

@@ -0,0 +1,132 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import datetime
import hashlib
import os
import socket
import time
import uuid
import requests
import importlib
from .common import string_to_bytes
def current_timestamp():
return int(time.time() * 1000)
def timestamp_to_date(timestamp, format_string="%Y-%m-%d %H:%M:%S"):
if not timestamp:
timestamp = time.time()
timestamp = int(timestamp) / 1000
time_array = time.localtime(timestamp)
str_date = time.strftime(format_string, time_array)
return str_date
def date_string_to_timestamp(time_str, format_string="%Y-%m-%d %H:%M:%S"):
time_array = time.strptime(time_str, format_string)
time_stamp = int(time.mktime(time_array) * 1000)
return time_stamp
def get_lan_ip():
if os.name != "nt":
import fcntl
import struct
def get_interface_ip(ifname):
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
return socket.inet_ntoa(
fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', string_to_bytes(ifname[:15])))[20:24])
ip = socket.gethostbyname(socket.getfqdn())
if ip.startswith("127.") and os.name != "nt":
interfaces = [
"bond1",
"eth0",
"eth1",
"eth2",
"wlan0",
"wlan1",
"wifi0",
"ath0",
"ath1",
"ppp0",
]
for ifname in interfaces:
try:
ip = get_interface_ip(ifname)
break
except IOError:
pass
return ip or ''
def from_dict_hook(in_dict: dict):
if "type" in in_dict and "data" in in_dict:
if in_dict["module"] is None:
return in_dict["data"]
else:
return getattr(importlib.import_module(
in_dict["module"]), in_dict["type"])(**in_dict["data"])
else:
return in_dict
def get_uuid():
return uuid.uuid1().hex
def datetime_format(date_time: datetime.datetime) -> datetime.datetime:
return datetime.datetime(date_time.year, date_time.month, date_time.day,
date_time.hour, date_time.minute, date_time.second)
def get_format_time() -> datetime.datetime:
return datetime_format(datetime.datetime.now())
def str2date(date_time: str):
return datetime.datetime.strptime(date_time, '%Y-%m-%d')
def elapsed2time(elapsed):
seconds = elapsed / 1000
minuter, second = divmod(seconds, 60)
hour, minuter = divmod(minuter, 60)
return '%02d:%02d:%02d' % (hour, minuter, second)
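# Example (the elapsed argument is in milliseconds):
#   >>> elapsed2time(3723000)
#   '01:02:03'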
def download_img(url):
if not url:
return ""
response = requests.get(url)
return "data:" + \
response.headers.get('Content-Type', 'image/jpg') + ";" + \
"base64," + base64.b64encode(response.content).decode("utf-8")
def delta_seconds(date_string: str):
dt = datetime.datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
return (datetime.datetime.now() - dt).total_seconds()
def hash_str2int(line: str, mod: int = 10 ** 8) -> int:
return int(hashlib.sha1(line.encode("utf-8")).hexdigest(), 16) % mod
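# Example: deterministic bucketing of strings, e.g. for sharding or stable sampling.
# SHA-1 makes the result stable across processes, unlike Python's built-in hash().
#   >>> hash_str2int("ragflow") == hash_str2int("ragflow")
#   True
#   >>> 0 <= hash_str2int("ragflow", mod=16) < 16
#   True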

api/utils/api_utils.py Normal file

@@ -0,0 +1,873 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import asyncio
import functools
import json
import logging
import os
import queue
import random
import threading
import time
from base64 import b64encode
from copy import deepcopy
from functools import wraps
from hmac import HMAC
from io import BytesIO
from typing import Any, Callable, Coroutine, Optional, Type, Union
from urllib.parse import quote, urlencode
from uuid import uuid1
import requests
import trio
# FastAPI imports
from fastapi import Request, Response as FastAPIResponse, HTTPException, status
from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
from fastapi import Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from itsdangerous import URLSafeTimedSerializer
from peewee import OperationalError
from werkzeug.http import HTTP_STATUS_CODES
from api import settings
from api.constants import REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC
from api.db import ActiveEnum
from api.db.db_models import APIToken
from api.db.services import UserService
from api.db.services.llm_service import LLMService
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils.json import CustomJSONEncoder, json_dumps
from api.utils import get_uuid
from rag.utils.mcp_tool_call_conn import MCPToolCallSession, close_multiple_mcp_toolcall_sessions
# FastAPI security scheme
security = HTTPBearer()
requests.models.complexjson.dumps = functools.partial(json.dumps, cls=CustomJSONEncoder)
def serialize_for_json(obj):
"""
Recursively serialize objects to make them JSON serializable.
Handles ModelMetaclass and other non-serializable objects.
"""
if hasattr(obj, '__dict__'):
# For objects with __dict__, try to serialize their attributes
try:
return {key: serialize_for_json(value) for key, value in obj.__dict__.items()
if not key.startswith('_')}
except (AttributeError, TypeError):
return str(obj)
elif hasattr(obj, '__name__'):
# For classes and metaclasses, return their name
return f"<{obj.__module__}.{obj.__name__}>" if hasattr(obj, '__module__') else f"<{obj.__name__}>"
elif isinstance(obj, (list, tuple)):
return [serialize_for_json(item) for item in obj]
elif isinstance(obj, dict):
return {key: serialize_for_json(value) for key, value in obj.items()}
elif isinstance(obj, (str, int, float, bool)) or obj is None:
return obj
else:
# Fallback: convert to string representation
return str(obj)
def request(**kwargs):
sess = requests.Session()
stream = kwargs.pop("stream", sess.stream)
timeout = kwargs.pop("timeout", None)
kwargs["headers"] = {k.replace("_", "-").upper(): v for k, v in kwargs.get("headers", {}).items()}
prepped = requests.Request(**kwargs).prepare()
if settings.CLIENT_AUTHENTICATION and settings.HTTP_APP_KEY and settings.SECRET_KEY:
timestamp = str(round(time.time() * 1000))
nonce = str(uuid1())
signature = b64encode(
HMAC(
settings.SECRET_KEY.encode("ascii"),
b"\n".join(
[
timestamp.encode("ascii"),
nonce.encode("ascii"),
settings.HTTP_APP_KEY.encode("ascii"),
prepped.path_url.encode("ascii"),
prepped.body if kwargs.get("json") else b"",
urlencode(sorted(kwargs["data"].items()), quote_via=quote, safe="-._~").encode("ascii") if kwargs.get("data") and isinstance(kwargs["data"], dict) else b"",
]
),
"sha1",
).digest()
).decode("ascii")
prepped.headers.update(
{
"TIMESTAMP": timestamp,
"NONCE": nonce,
"APP-KEY": settings.HTTP_APP_KEY,
"SIGNATURE": signature,
}
)
return sess.send(prepped, stream=stream, timeout=timeout)
def get_exponential_backoff_interval(retries, full_jitter=False):
"""Calculate the exponential backoff wait time."""
# Will be zero if factor equals 0
countdown = min(REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC * (2**retries))
# Full jitter according to
# https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
if full_jitter:
countdown = random.randrange(countdown + 1)
# Adjust according to maximum wait time and account for negative values.
return max(0, countdown)
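# Illustrative values, assuming REQUEST_WAIT_SEC = 2 and REQUEST_MAX_WAIT_SEC = 60
# (the actual constants live in api.constants):
#   retries=0 -> 2, retries=1 -> 4, retries=3 -> 16, retries=5 -> 60 (capped)
# With full_jitter=True the wait is drawn uniformly from [0, countdown].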
def get_data_error_result(code=settings.RetCode.DATA_ERROR, message="Sorry! Data missing!"):
logging.exception(Exception(message))
result_dict = {"code": code, "message": message}
response = {}
for key, value in result_dict.items():
if value is None and key != "code":
continue
else:
response[key] = value
return JSONResponse(content=response)
def server_error_response(e):
logging.exception(e)
try:
if e.code == 401:
return get_json_result(code=401, message=repr(e))
except BaseException:
pass
if len(e.args) > 1:
try:
serialized_data = serialize_for_json(e.args[1])
return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=serialized_data)
except Exception:
return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=None)
if repr(e).find("index_not_found_exception") >= 0:
return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message="No chunk found, please upload file and parse it.")
return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
def error_response(response_code, message=None):
if message is None:
message = HTTP_STATUS_CODES.get(response_code, "Unknown Error")
return JSONResponse(
content={
"message": message,
"code": response_code,
},
status_code=response_code,
)
# FastAPI version: request validation is done with Pydantic models instead of decorators.
# The decorator below is no longer needed, since FastAPI validates Pydantic models automatically.
def validate_request(*args, **kwargs):
"""
Deprecated decorator: use Pydantic models for validation in FastAPI.
Kept for backward compatibility; performs no validation.
"""
def wrapper(func):
@wraps(func)
def decorated_function(*_args, **_kwargs):
# No manual validation needed in FastAPI; Pydantic handles it automatically
return func(*_args, **_kwargs)
return decorated_function
return wrapper
def not_allowed_parameters(*params):
"""
Deprecated decorator: use Pydantic models for validation in FastAPI.
Kept for backward compatibility; performs no validation.
"""
def decorator(f):
def wrapper(*args, **kwargs):
# No manual validation needed in FastAPI; Pydantic handles it automatically
return f(*args, **kwargs)
return wrapper
return decorator
def active_required(f):
"""
Deprecated decorator: use dependency injection for user validation in FastAPI.
Kept for backward compatibility; performs no validation.
"""
@wraps(f)
def wrapper(*args, **kwargs):
# Use dependency injection for user validation in FastAPI
return f(*args, **kwargs)
return wrapper
def is_localhost(ip):
return ip in {"127.0.0.1", "::1", "[::1]", "localhost"}
def send_file_in_mem(data, filename):
"""
Send file data held in memory.
Note: with FastAPI the caller must build the actual response from the returned object.
"""
if not isinstance(data, (str, bytes)):
data = json_dumps(data)
if isinstance(data, str):
data = data.encode("utf-8")
f = BytesIO()
f.write(data)
f.seek(0)
# In FastAPI, wrap the result in a FileResponse or StreamingResponse.
# A file-like object is returned here; the caller constructs the response.
return f
def get_json_result(code=settings.RetCode.SUCCESS, message="success", data=None):
response = {"code": code, "message": message, "data": data}
return JSONResponse(content=response)
def apikey_required(func):
"""
Deprecated decorator: use dependency injection for API key validation in FastAPI.
Kept for backward compatibility; performs no validation.
"""
@wraps(func)
def decorated_function(*args, **kwargs):
# Use dependency injection for API key validation in FastAPI
return func(*args, **kwargs)
return decorated_function
def build_error_result(code=settings.RetCode.FORBIDDEN, message="success"):
response = {"code": code, "message": message}
return JSONResponse(content=response, status_code=code)
def construct_response(code=settings.RetCode.SUCCESS, message="success", data=None, auth=None):
result_dict = {"code": code, "message": message, "data": data}
response_dict = {}
for key, value in result_dict.items():
if value is None and key != "code":
continue
else:
response_dict[key] = value
headers = {
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Method": "*",
"Access-Control-Allow-Headers": "*",
"Access-Control-Expose-Headers": "Authorization"
}
if auth:
headers["Authorization"] = auth
return JSONResponse(content=response_dict, headers=headers)
def construct_result(code=settings.RetCode.DATA_ERROR, message="data is missing"):
result_dict = {"code": code, "message": message}
response = {}
for key, value in result_dict.items():
if value is None and key != "code":
continue
else:
response[key] = value
return JSONResponse(content=response)
def construct_json_result(code=settings.RetCode.SUCCESS, message="success", data=None):
if data is None:
return JSONResponse(content={"code": code, "message": message})
else:
return JSONResponse(content={"code": code, "message": message, "data": data})
def construct_error_response(e):
logging.exception(e)
try:
if e.code == 401:
return construct_json_result(code=settings.RetCode.UNAUTHORIZED, message=repr(e))
except BaseException:
pass
if len(e.args) > 1:
return construct_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
return construct_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
def token_required(func):
"""
Deprecated decorator: use dependency injection for token validation in FastAPI.
Kept for backward compatibility; performs no validation.
"""
@wraps(func)
def decorated_function(*args, **kwargs):
# Use dependency injection for token validation in FastAPI
return func(*args, **kwargs)
return decorated_function
def get_result(code=settings.RetCode.SUCCESS, message="", data=None):
if code == 0:
if data is not None:
response = {"code": code, "data": data}
else:
response = {"code": code}
else:
response = {"code": code, "message": message}
return JSONResponse(content=response)
def get_error_data_result(
message="Sorry! Data missing!",
code=settings.RetCode.DATA_ERROR,
):
result_dict = {"code": code, "message": message}
response = {}
for key, value in result_dict.items():
if value is None and key != "code":
continue
else:
response[key] = value
return JSONResponse(content=response)
def get_error_argument_result(message="Invalid arguments"):
return get_result(code=settings.RetCode.ARGUMENT_ERROR, message=message)
# FastAPI dependency-injection helpers
async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
"""获取当前用户 - FastAPI 版本"""
from api.db import StatusEnum
try:
jwt = URLSafeTimedSerializer(secret_key=settings.SECRET_KEY)
authorization = credentials.credentials
if authorization:
try:
access_token = str(jwt.loads(authorization))
if not access_token or not access_token.strip():
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication attempt with empty access token"
)
# Access tokens should be UUIDs (32 hex characters)
if len(access_token.strip()) < 32:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Authentication attempt with invalid token format: {len(access_token)} chars"
)
user = UserService.query(
access_token=access_token, status=StatusEnum.VALID.value
)
if user:
if not user[0].access_token or not user[0].access_token.strip():
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication attempt with empty access token"
)
return user[0]
else:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication failed: Invalid access token"
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Authentication failed: {str(e)}"
)
else:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication failed: No authorization header"
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Authentication failed: {str(e)}"
)
async def get_current_user_optional(credentials: HTTPAuthorizationCredentials = Depends(security)):
"""获取当前用户(可选)- FastAPI 版本"""
try:
return await get_current_user(credentials)
except HTTPException:
return None
async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
"""验证 API Key - FastAPI 版本"""
try:
token = credentials.credentials
objs = APIToken.query(token=token)
if not objs:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="API-KEY is invalid!"
)
return objs[0]
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"API Key verification failed: {str(e)}"
)
def create_file_response(data, filename: str, media_type: str = "application/octet-stream"):
"""创建文件响应 - FastAPI 版本"""
if not isinstance(data, (str, bytes)):
data = json_dumps(data)
if isinstance(data, str):
data = data.encode("utf-8")
return StreamingResponse(
BytesIO(data),
media_type=media_type,
headers={"Content-Disposition": f"attachment; filename={filename}"}
)
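# Usage sketch (illustrative; `router` and `fetch_blob` are assumed helpers, not part of this module):
#
#   @router.get("/files/{doc_id}")
#   async def download(doc_id: str):
#       blob = fetch_blob(doc_id)
#       return create_file_response(blob, f"{doc_id}.bin")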
def get_error_permission_result(message="Permission error"):
return get_result(code=settings.RetCode.PERMISSION_ERROR, message=message)
def get_error_operating_result(message="Operating error"):
return get_result(code=settings.RetCode.OPERATING_ERROR, message=message)
def generate_confirmation_token(tenant_id):
serializer = URLSafeTimedSerializer(tenant_id)
return "ragflow-" + serializer.dumps(get_uuid(), salt=tenant_id)[2:34]
def get_parser_config(chunk_method, parser_config):
if not chunk_method:
chunk_method = "naive"
# Define default configurations for each chunking method
key_mapping = {
"naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"tag": None,
"resume": None,
"manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"table": None,
"paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
"one": None,
"knowledge_graph": {
"chunk_token_num": 8192,
"delimiter": r"\n",
"entity_types": ["organization", "person", "location", "event", "time"],
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
"email": None,
"picture": None,
}
default_config = key_mapping[chunk_method]
# If no parser_config provided, return default
if not parser_config:
return default_config
# If parser_config is provided, merge with defaults to ensure required fields exist
if default_config is None:
return parser_config
# Ensure raptor and graphrag fields have default values if not provided
merged_config = deep_merge(default_config, parser_config)
return merged_config
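# Example: overriding a single field for the "naive" method keeps every other
# default (delimiter, raptor, graphrag, ...) thanks to deep_merge below:
#   >>> cfg = get_parser_config("naive", {"chunk_token_num": 1024})
#   >>> cfg["chunk_token_num"], cfg["raptor"]
#   (1024, {'use_raptor': False})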
def get_data_openai(
id=None,
created=None,
model=None,
prompt_tokens=0,
completion_tokens=0,
content=None,
finish_reason=None,
object="chat.completion",
param=None,
stream=False
):
total_tokens = prompt_tokens + completion_tokens
if stream:
return {
"id": f"{id}",
"object": "chat.completion.chunk",
"model": model,
"choices": [{
"delta": {"content": content},
"finish_reason": finish_reason,
"index": 0,
}],
}
return {
"id": f"{id}",
"object": object,
"created": int(time.time()) if created else None,
"model": model,
"param": param,
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": total_tokens,
"completion_tokens_details": {
"reasoning_tokens": 0,
"accepted_prediction_tokens": 0,
"rejected_prediction_tokens": 0,
},
},
"choices": [{
"message": {
"role": "assistant",
"content": content
},
"logprobs": None,
"finish_reason": finish_reason,
"index": 0,
}],
}
def check_duplicate_ids(ids, id_type="item"):
"""
Check for duplicate IDs in a list and return unique IDs and error messages.
Args:
ids (list): List of IDs to check for duplicates
id_type (str): Type of ID for error messages (e.g., 'document', 'dataset', 'chunk')
Returns:
tuple: (unique_ids, error_messages)
- unique_ids (list): List of unique IDs
- error_messages (list): List of error messages for duplicate IDs
"""
id_count = {}
duplicate_messages = []
# Count occurrences of each ID
for id_value in ids:
id_count[id_value] = id_count.get(id_value, 0) + 1
# Check for duplicates
for id_value, count in id_count.items():
if count > 1:
duplicate_messages.append(f"Duplicate {id_type} ids: {id_value}")
# Return unique IDs and error messages
return list(set(ids)), duplicate_messages
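# Example (doctest style; note the returned unique list is unordered):
#   >>> ids, errs = check_duplicate_ids(["d1", "d2", "d1"], "document")
#   >>> sorted(ids), errs
#   (['d1', 'd2'], ['Duplicate document ids: d1'])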
def verify_embedding_availability(embd_id: str, tenant_id: str) -> tuple[bool, JSONResponse | None]:
"""
Verifies availability of an embedding model for a specific tenant.
Performs comprehensive verification through:
1. Identifier Parsing: Decomposes embd_id into name and factory components
2. System Verification: Checks model registration in LLMService
3. Tenant Authorization: Validates tenant-specific model assignments
4. Built-in Model Check: Confirms inclusion in predefined system models
Args:
embd_id (str): Unique identifier for the embedding model in format "model_name@factory"
tenant_id (str): Tenant identifier for access control
Returns:
tuple[bool, JSONResponse | None]:
- First element (bool):
- True: Model is available and authorized
- False: Validation failed
- Second element contains:
- None on success
- JSONResponse with error details on failure
Raises:
ValueError: When model identifier format is invalid
OperationalError: When database connection fails (auto-handled)
Examples:
>>> verify_embedding_availability("text-embedding@openai", "tenant_123")
(True, None)
>>> verify_embedding_availability("invalid_model", "tenant_123")
(False, {'code': 101, 'message': "Unsupported model: <invalid_model>"})
"""
try:
llm_name, llm_factory = TenantLLMService.split_model_name_and_factory(embd_id)
in_llm_service = bool(LLMService.query(llm_name=llm_name, fid=llm_factory, model_type="embedding"))
tenant_llms = TenantLLMService.get_my_llms(tenant_id=tenant_id)
is_tenant_model = any(llm["llm_name"] == llm_name and llm["llm_factory"] == llm_factory and llm["model_type"] == "embedding" for llm in tenant_llms)
is_builtin_model = embd_id in settings.BUILTIN_EMBEDDING_MODELS
if not (is_builtin_model or is_tenant_model or in_llm_service):
return False, get_error_argument_result(f"Unsupported model: <{embd_id}>")
if not (is_builtin_model or is_tenant_model):
return False, get_error_argument_result(f"Unauthorized model: <{embd_id}>")
except OperationalError as e:
logging.exception(e)
return False, get_error_data_result(message="Database operation failed")
return True, None
def deep_merge(default: dict, custom: dict) -> dict:
"""
Recursively merges two dictionaries with priority given to `custom` values.
Creates a deep copy of the `default` dictionary and iteratively merges nested
dictionaries using a stack-based approach. Non-dict values in `custom` will
completely override corresponding entries in `default`.
Args:
default (dict): Base dictionary containing default values.
custom (dict): Dictionary containing overriding values.
Returns:
dict: New merged dictionary combining values from both inputs.
Example:
>>> from copy import deepcopy
>>> default = {"a": 1, "nested": {"x": 10, "y": 20}}
>>> custom = {"b": 2, "nested": {"y": 99, "z": 30}}
>>> deep_merge(default, custom)
{'a': 1, 'b': 2, 'nested': {'x': 10, 'y': 99, 'z': 30}}
>>> deep_merge({"config": {"mode": "auto"}}, {"config": "manual"})
{'config': 'manual'}
Notes:
1. Merge priority is always given to `custom` values at all nesting levels
2. Non-dict values (e.g. list, str) in `custom` will replace entire values
in `default`, even if the original value was a dictionary
3. Time complexity: O(N) where N is total key-value pairs in `custom`
4. Recommended for configuration merging and nested data updates
"""
merged = deepcopy(default)
stack = [(merged, custom)]
while stack:
base_dict, override_dict = stack.pop()
for key, val in override_dict.items():
if key in base_dict and isinstance(val, dict) and isinstance(base_dict[key], dict):
stack.append((base_dict[key], val))
else:
base_dict[key] = val
return merged
def remap_dictionary_keys(source_data: dict, key_aliases: dict = None) -> dict:
"""
Transform dictionary keys using a configurable mapping schema.
Args:
source_data: Original dictionary to process
key_aliases: Custom key transformation rules (Optional)
When provided, overrides default key mapping
Format: {<original_key>: <new_key>, ...}
Returns:
dict: New dictionary with transformed keys preserving original values
Example:
>>> input_data = {"old_key": "value", "another_field": 42}
>>> remap_dictionary_keys(input_data, {"old_key": "new_key"})
{'new_key': 'value', 'another_field': 42}
"""
DEFAULT_KEY_MAP = {
"chunk_num": "chunk_count",
"doc_num": "document_count",
"parser_id": "chunk_method",
"embd_id": "embedding_model",
}
transformed_data = {}
mapping = key_aliases or DEFAULT_KEY_MAP
for original_key, value in source_data.items():
mapped_key = mapping.get(original_key, original_key)
transformed_data[mapped_key] = value
return transformed_data
def group_by(list_of_dict, key):
res = {}
for item in list_of_dict:
if item[key] in res:
res[item[key]].append(item)
else:
res[item[key]] = [item]
return res
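# Example:
#   >>> group_by([{"k": "a", "v": 1}, {"k": "b", "v": 2}, {"k": "a", "v": 3}], "k")
#   {'a': [{'k': 'a', 'v': 1}, {'k': 'a', 'v': 3}], 'b': [{'k': 'b', 'v': 2}]}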
def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, str]:
results = {}
tool_call_sessions = []
try:
for mcp_server in mcp_servers:
server_key = mcp_server.id
cached_tools = mcp_server.variables.get("tools", {})
tool_call_session = MCPToolCallSession(mcp_server, mcp_server.variables)
tool_call_sessions.append(tool_call_session)
try:
tools = tool_call_session.get_tools(timeout)
except Exception:
tools = []
results[server_key] = []
for tool in tools:
tool_dict = tool.model_dump()
cached_tool = cached_tools.get(tool_dict["name"], {})
tool_dict["enabled"] = cached_tool.get("enabled", True)
results[server_key].append(tool_dict)
# PERF: blocking call to close sessions — consider moving to background thread or task queue
close_multiple_mcp_toolcall_sessions(tool_call_sessions)
return results, ""
except Exception as e:
return {}, str(e)
TimeoutException = Union[Type[BaseException], BaseException]
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
def timeout(seconds: Optional[float | int | str] = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
if isinstance(seconds, str):
seconds = float(seconds)
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
result_queue = queue.Queue(maxsize=1)
def target():
try:
result = func(*args, **kwargs)
result_queue.put(result)
except Exception as e:
result_queue.put(e)
thread = threading.Thread(target=target)
thread.daemon = True
thread.start()
for a in range(attempts):
try:
if os.environ.get("ENABLE_TIMEOUT_ASSERTION"):
result = result_queue.get(timeout=seconds)
else:
result = result_queue.get()
if isinstance(result, Exception):
raise result
return result
except queue.Empty:
pass
raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds and {attempts} attempts.")
@wraps(func)
async def async_wrapper(*args, **kwargs) -> Any:
if seconds is None:
return await func(*args, **kwargs)
for a in range(attempts):
try:
if os.environ.get("ENABLE_TIMEOUT_ASSERTION"):
with trio.fail_after(seconds):
return await func(*args, **kwargs)
else:
return await func(*args, **kwargs)
except trio.TooSlowError:
if a < attempts - 1:
continue
if on_timeout is not None:
if callable(on_timeout):
result = on_timeout()
if isinstance(result, Coroutine):
return await result
return result
return on_timeout
if exception is None:
raise TimeoutError(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
if isinstance(exception, BaseException):
raise exception
if isinstance(exception, type) and issubclass(exception, BaseException):
raise exception(f"Operation timed out after {seconds} seconds and {attempts} attempts.")
raise RuntimeError("Invalid exception type provided")
if asyncio.iscoroutinefunction(func):
return async_wrapper
return wrapper
return decorator
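# Usage sketch (illustrative): retry a slow coroutine up to 3 times, 5 seconds each,
# returning a fallback instead of raising. Note the timeout is only enforced when
# the ENABLE_TIMEOUT_ASSERTION environment variable is set.
#
#   @timeout(5, attempts=3, on_timeout=lambda: {"status": "degraded"})
#   async def fetch_remote_config():
#       ...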
async def is_strong_enough(chat_model, embedding_model):
count = settings.STRONG_TEST_COUNT
if not chat_model or not embedding_model:
return
if isinstance(count, int) and count <= 0:
return
@timeout(60, 2)
async def _is_strong_enough():
nonlocal chat_model, embedding_model
if embedding_model:
with trio.fail_after(10):
_ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
if chat_model:
with trio.fail_after(30):
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
if res.find("**ERROR**") >= 0:
raise Exception(res)
# Pressure test for GraphRAG task
async with trio.open_nursery() as nursery:
for _ in range(count):
nursery.start_soon(_is_strong_enough)

api/utils/base64_image.py Normal file

@@ -0,0 +1,56 @@
import base64
import logging
from functools import partial
from io import BytesIO
from PIL import Image
test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAIAAAD/gAIDAAAA6ElEQVR4nO3QwQ3AIBDAsIP9d25XIC+EZE8QZc18w5l9O+AlZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBT+IYAHHLHkdEgAAAABJRU5ErkJggg=="
test_image = base64.b64decode(test_image_base64)
async def image2id(d: dict, storage_put_func: partial, objname: str, bucket: str = "imagetemps"):
import trio
# Imported here (not at module top) to avoid a circular import with the task executor.
from rag.svr.task_executor import minio_limiter
if not d.get("image"):
return
with BytesIO() as output_buffer:
if isinstance(d["image"], bytes):
output_buffer.write(d["image"])
output_buffer.seek(0)
else:
# If the image is in RGBA mode, convert it to RGB mode before saving it in JPEG format.
if d["image"].mode in ("RGBA", "P"):
converted_image = d["image"].convert("RGB")
d["image"] = converted_image
try:
d["image"].save(output_buffer, format='JPEG')
except OSError as e:
logging.warning(
"Saving image exception, ignore: {}".format(str(e)))
async with minio_limiter:
await trio.to_thread.run_sync(lambda: storage_put_func(bucket=bucket, fnm=objname, binary=output_buffer.getvalue()))
d["img_id"] = f"{bucket}-{objname}"
if not isinstance(d["image"], bytes):
d["image"].close()
del d["image"] # Remove image reference
def id2image(image_id: str | None, storage_get_func: partial):
if not image_id:
return
arr = image_id.split("-")
if len(arr) != 2:
return
bkt, nm = arr
try:
blob = storage_get_func(bucket=bkt, filename=nm)
if not blob:
return
return Image.open(BytesIO(blob))
except Exception as e:
logging.exception(e)

api/utils/commands.py Normal file

@@ -0,0 +1,78 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import click
import re
from flask import Flask
from werkzeug.security import generate_password_hash
from api.db.services import UserService
@click.command('reset-password', help='Reset the account password.')
@click.option('--email', prompt=True, help='The email address of the account whose password you need to reset')
@click.option('--new-password', prompt=True, help='the new password.')
@click.option('--password-confirm', prompt=True, help='the new password confirm.')
def reset_password(email, new_password, password_confirm):
if str(new_password).strip() != str(password_confirm).strip():
click.echo(click.style('Sorry, the two passwords do not match.', fg='red'))
return
user = UserService.query(email=email)
if not user:
click.echo(click.style('Sorry, the email is not registered.', fg='red'))
return
encode_password = base64.b64encode(new_password.encode('utf-8')).decode('utf-8')
password_hash = generate_password_hash(encode_password)
user_dict = {
'password': password_hash
}
UserService.update_user(user[0].id, user_dict)
click.echo(click.style('Congratulations! Password has been reset.', fg='green'))
@click.command('reset-email', help='Reset the account email.')
@click.option('--email', prompt=True, help='The old email address of the account whose email you need to reset')
@click.option('--new-email', prompt=True, help='the new email.')
@click.option('--email-confirm', prompt=True, help='the new email confirm.')
def reset_email(email, new_email, email_confirm):
if str(new_email).strip() != str(email_confirm).strip():
click.echo(click.style('Sorry, new email and confirm email do not match.', fg='red'))
return
if str(new_email).strip() == str(email).strip():
click.echo(click.style('Sorry, new email and old email are the same.', fg='red'))
return
user = UserService.query(email=email)
if not user:
click.echo(click.style('Sorry, the account [{}] does not exist.'.format(email), fg='red'))
return
if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,4}$", new_email):
click.echo(click.style('Sorry, {} is not a valid email.'.format(new_email), fg='red'))
return
new_user = UserService.query(email=new_email)
if new_user:
click.echo(click.style('Sorry, the account [{}] already exists.'.format(new_email), fg='red'))
return
user_dict = {
'email': new_email
}
UserService.update_user(user[0].id, user_dict)
click.echo(click.style('Congratulations! Email has been reset.', fg='green'))
def register_commands(app: Flask):
app.cli.add_command(reset_password)
app.cli.add_command(reset_email)

api/utils/common.py Normal file

@@ -0,0 +1,46 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
def string_to_bytes(string):
return string if isinstance(
string, bytes) else string.encode(encoding="utf-8")
def bytes_to_string(byte):
return byte.decode(encoding="utf-8")
def convert_bytes(size_in_bytes: int) -> str:
"""
Format size in bytes.
"""
if size_in_bytes == 0:
return "0 B"
units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
i = 0
size = float(size_in_bytes)
while size >= 1024 and i < len(units) - 1:
size /= 1024
i += 1
if i == 0 or size >= 100:
return f"{size:.0f} {units[i]}"
elif size >= 10:
return f"{size:.1f} {units[i]}"
else:
return f"{size:.2f} {units[i]}"

api/utils/configs.py Normal file

@@ -0,0 +1,179 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import io
import copy
import logging
import base64
import pickle
import importlib
from api.utils import file_utils
from filelock import FileLock
from api.utils.common import bytes_to_string, string_to_bytes
from api.constants import SERVICE_CONF
def conf_realpath(conf_name):
conf_path = f"conf/{conf_name}"
return os.path.join(file_utils.get_project_base_directory(), conf_path)
def read_config(conf_name=SERVICE_CONF):
local_config = {}
local_path = conf_realpath(f'local.{conf_name}')
# load local config file
if os.path.exists(local_path):
local_config = file_utils.load_yaml_conf(local_path)
if not isinstance(local_config, dict):
raise ValueError(f'Invalid config file: "{local_path}".')
global_config_path = conf_realpath(conf_name)
global_config = file_utils.load_yaml_conf(global_config_path)
if not isinstance(global_config, dict):
raise ValueError(f'Invalid config file: "{global_config_path}".')
global_config.update(local_config)
return global_config
CONFIGS = read_config()
def show_configs():
msg = f"Current configs, from {conf_realpath(SERVICE_CONF)}:"
for k, v in CONFIGS.items():
if isinstance(v, dict):
if "password" in v:
v = copy.deepcopy(v)
v["password"] = "*" * 8
if "access_key" in v:
v = copy.deepcopy(v)
v["access_key"] = "*" * 8
if "secret_key" in v:
v = copy.deepcopy(v)
v["secret_key"] = "*" * 8
if "secret" in v:
v = copy.deepcopy(v)
v["secret"] = "*" * 8
if "sas_token" in v:
v = copy.deepcopy(v)
v["sas_token"] = "*" * 8
if "oauth" in k:
v = copy.deepcopy(v)
for key, val in v.items():
if "client_secret" in val:
val["client_secret"] = "*" * 8
if "authentication" in k:
v = copy.deepcopy(v)
for key, val in v.items():
if "http_secret_key" in val:
val["http_secret_key"] = "*" * 8
msg += f"\n\t{k}: {v}"
logging.info(msg)
def get_base_config(key, default=None):
if key is None:
return None
if default is None:
default = os.environ.get(key.upper())
return CONFIGS.get(key, default)
def decrypt_database_password(password):
encrypt_password = get_base_config("encrypt_password", False)
encrypt_module = get_base_config("encrypt_module", False)
private_key = get_base_config("private_key", None)
if not password or not encrypt_password:
return password
if not private_key:
raise ValueError("No private key")
module_fun = encrypt_module.split("#")
pwdecrypt_fun = getattr(
importlib.import_module(
module_fun[0]),
module_fun[1])
return pwdecrypt_fun(private_key, password)
def decrypt_database_config(
database=None, passwd_key="password", name="database"):
if not database:
database = get_base_config(name, {})
database[passwd_key] = decrypt_database_password(database[passwd_key])
return database
def update_config(key, value, conf_name=SERVICE_CONF):
conf_path = conf_realpath(conf_name=conf_name)
if not os.path.isabs(conf_path):
conf_path = os.path.join(
file_utils.get_project_base_directory(), conf_path)
with FileLock(os.path.join(os.path.dirname(conf_path), ".lock")):
config = file_utils.load_yaml_conf(conf_path=conf_path) or {}
config[key] = value
file_utils.rewrite_yaml_conf(conf_path=conf_path, config=config)
safe_module = {
'numpy',
'rag_flow'
}
class RestrictedUnpickler(pickle.Unpickler):
def find_class(self, module, name):
import importlib
if module.split('.')[0] in safe_module:
_module = importlib.import_module(module)
return getattr(_module, name)
# Forbid everything else.
raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
(module, name))
def restricted_loads(src):
"""Helper function analogous to pickle.loads()."""
return RestrictedUnpickler(io.BytesIO(src)).load()
def serialize_b64(src, to_str=False):
dest = base64.b64encode(pickle.dumps(src))
if not to_str:
return dest
else:
return bytes_to_string(dest)
def deserialize_b64(src):
src = base64.b64decode(
string_to_bytes(src) if isinstance(
src, str) else src)
use_deserialize_safe_module = get_base_config(
'use_deserialize_safe_module', False)
if use_deserialize_safe_module:
return restricted_loads(src)
return pickle.loads(src)
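# Round-trip example (pickle-based, so only feed it trusted input, or set
# use_deserialize_safe_module so restricted_loads vets the unpickled modules):
#   >>> deserialize_b64(serialize_b64({"a": 1}, to_str=True))
#   {'a': 1}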

api/utils/crypt.py Normal file

@@ -0,0 +1,64 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import os
import sys
from Cryptodome.PublicKey import RSA
from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5
from api.utils import file_utils
def crypt(line):
"""
decrypt(crypt(input_string)) returns base64(input_string), matching the scheme used by the frontend and admin_client.
"""
file_path = os.path.join(file_utils.get_project_base_directory(), "conf", "public.pem")
rsa_key = RSA.importKey(open(file_path).read(), "Welcome")
cipher = Cipher_pkcs1_v1_5.new(rsa_key)
password_base64 = base64.b64encode(line.encode('utf-8')).decode("utf-8")
encrypted_password = cipher.encrypt(password_base64.encode())
return base64.b64encode(encrypted_password).decode('utf-8')
def decrypt(line):
file_path = os.path.join(file_utils.get_project_base_directory(), "conf", "private.pem")
rsa_key = RSA.importKey(open(file_path).read(), "Welcome")
cipher = Cipher_pkcs1_v1_5.new(rsa_key)
return cipher.decrypt(base64.b64decode(line), "Fail to decrypt password!").decode('utf-8')
def decrypt2(crypt_text):
from base64 import b64decode, b16decode
from Crypto.Cipher import PKCS1_v1_5 as Cipher_PKCS1_v1_5
from Crypto.PublicKey import RSA
decode_data = b64decode(crypt_text)
if len(decode_data) == 127:
hex_fixed = '00' + decode_data.hex()
decode_data = b16decode(hex_fixed.upper())
file_path = os.path.join(file_utils.get_project_base_directory(), "conf", "private.pem")
pem = open(file_path).read()
rsa_key = RSA.importKey(pem, "Welcome")
cipher = Cipher_PKCS1_v1_5.new(rsa_key)
decrypt_text = cipher.decrypt(decode_data, None)
return (b64decode(decrypt_text)).decode()
if __name__ == "__main__":
passwd = crypt(sys.argv[1])
print(passwd)
print(decrypt(passwd))

api/utils/file_utils.py Normal file

@@ -0,0 +1,286 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from io import BytesIO
import pdfplumber
from cachetools import LRUCache, cached
from PIL import Image
from ruamel.yaml import YAML
from api.constants import IMG_BASE64_PREFIX
from api.db import FileType
PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
RAG_BASE = os.getenv("RAG_BASE")
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
def get_project_base_directory(*args):
global PROJECT_BASE
if PROJECT_BASE is None:
PROJECT_BASE = os.path.abspath(
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
os.pardir,
os.pardir,
)
)
if args:
return os.path.join(PROJECT_BASE, *args)
return PROJECT_BASE
def get_rag_directory(*args):
global RAG_BASE
if RAG_BASE is None:
RAG_BASE = os.path.abspath(
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
os.pardir,
os.pardir,
os.pardir,
)
)
if args:
return os.path.join(RAG_BASE, *args)
return RAG_BASE
def get_rag_python_directory(*args):
return get_rag_directory("python", *args)
def get_home_cache_dir():
dir = os.path.join(os.path.expanduser("~"), ".ragflow")
try:
os.mkdir(dir)
except OSError:
pass
return dir
@cached(cache=LRUCache(maxsize=10))
def load_json_conf(conf_path):
if os.path.isabs(conf_path):
json_conf_path = conf_path
else:
json_conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(json_conf_path) as f:
return json.load(f)
except BaseException:
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
def dump_json_conf(config_data, conf_path):
if os.path.isabs(conf_path):
json_conf_path = conf_path
else:
json_conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(json_conf_path, "w") as f:
json.dump(config_data, f, indent=4)
except BaseException:
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
def load_json_conf_real_time(conf_path):
if os.path.isabs(conf_path):
json_conf_path = conf_path
else:
json_conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(json_conf_path) as f:
return json.load(f)
except BaseException:
raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
def load_yaml_conf(conf_path):
if not os.path.isabs(conf_path):
conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(conf_path) as f:
yaml = YAML(typ="safe", pure=True)
return yaml.load(f)
except Exception as e:
raise EnvironmentError("loading yaml file config from {} failed:".format(conf_path), e)
def rewrite_yaml_conf(conf_path, config):
if not os.path.isabs(conf_path):
conf_path = os.path.join(get_project_base_directory(), conf_path)
try:
with open(conf_path, "w") as f:
yaml = YAML(typ="safe")
yaml.dump(config, f)
except Exception as e:
raise EnvironmentError("rewrite yaml file config {} failed:".format(conf_path), e)
def rewrite_json_file(filepath, json_data):
with open(filepath, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=4, separators=(",", ": "))
def filename_type(filename):
filename = filename.lower()
if re.match(r".*\.pdf$", filename):
return FileType.PDF.value
if re.match(r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
return FileType.DOC.value
if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
return FileType.AURAL.value
if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
return FileType.VISUAL.value
return FileType.OTHER.value
def thumbnail_img(filename, blob):
"""
MySQL LongText max length is 65535
"""
filename = filename.lower()
if re.match(r".*\.pdf$", filename):
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(BytesIO(blob))
buffered = BytesIO()
resolution = 32
img = None
for _ in range(10):
# https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
img = buffered.getvalue()
if len(img) >= 64000 and resolution >= 2:
resolution = resolution / 2
buffered = BytesIO()
else:
break
pdf.close()
return img
elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
image = Image.open(BytesIO(blob))
image.thumbnail((30, 30))
buffered = BytesIO()
image.save(buffered, format="png")
return buffered.getvalue()
elif re.match(r".*\.(ppt|pptx)$", filename):
import aspose.pydrawing as drawing
import aspose.slides as slides
try:
with slides.Presentation(BytesIO(blob)) as presentation:
buffered = BytesIO()
scale = 0.03
img = None
for _ in range(10):
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
img = buffered.getvalue()
if len(img) >= 64000:
scale = scale / 2.0
buffered = BytesIO()
else:
break
return img
except Exception:
pass
return None
def thumbnail(filename, blob):
img = thumbnail_img(filename, blob)
if img is not None:
return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
else:
return ""
def traversal_files(base):
for root, ds, fs in os.walk(base):
for f in fs:
fullname = os.path.join(root, f)
yield fullname
def repair_pdf_with_ghostscript(input_bytes):
if shutil.which("gs") is None:
return input_bytes
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
temp_in.write(input_bytes)
temp_in.flush()
cmd = [
"gs",
"-o",
temp_out.name,
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/prepress",
temp_in.name,
]
try:
proc = subprocess.run(cmd, capture_output=True, text=True)
if proc.returncode != 0:
return input_bytes
except Exception:
return input_bytes
temp_out.seek(0)
repaired_bytes = temp_out.read()
return repaired_bytes
def read_potential_broken_pdf(blob):
def try_open(blob):
try:
with pdfplumber.open(BytesIO(blob)) as pdf:
if pdf.pages:
return True
except Exception:
return False
return False
if try_open(blob):
return blob
repaired = repair_pdf_with_ghostscript(blob)
if try_open(repaired):
return repaired
return blob

api/utils/health.py Normal file

@@ -0,0 +1,104 @@
from timeit import default_timer as timer
from api import settings
from api.db.db_models import DB
from rag.utils.redis_conn import REDIS_CONN
from rag.utils.storage_factory import STORAGE_IMPL
def _ok_nok(ok: bool) -> str:
return "ok" if ok else "nok"
def check_db() -> tuple[bool, dict]:
st = timer()
try:
# lightweight probe; works for MySQL/Postgres
DB.execute_sql("SELECT 1")
return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_redis() -> tuple[bool, dict]:
st = timer()
try:
ok = bool(REDIS_CONN.health())
return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_doc_engine() -> tuple[bool, dict]:
st = timer()
try:
meta = settings.docStoreConn.health()
# treat any successful call as ok
return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", **(meta or {})}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_storage() -> tuple[bool, dict]:
st = timer()
try:
STORAGE_IMPL.health()
return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_chat() -> tuple[bool, dict]:
st = timer()
try:
cfg = getattr(settings, "CHAT_CFG", None)
ok = bool(cfg and cfg.get("factory"))
return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def run_health_checks() -> tuple[dict, bool]:
result: dict[str, str | dict] = {}
db_ok, db_meta = check_db()
chat_ok, chat_meta = check_chat()
result["db"] = _ok_nok(db_ok)
if not db_ok:
result.setdefault("_meta", {})["db"] = db_meta
result["chat"] = _ok_nok(chat_ok)
if not chat_ok:
result.setdefault("_meta", {})["chat"] = chat_meta
# Optional probes (do not change minimal contract but exposed for observability)
try:
redis_ok, redis_meta = check_redis()
result["redis"] = _ok_nok(redis_ok)
if not redis_ok:
result.setdefault("_meta", {})["redis"] = redis_meta
except Exception:
result["redis"] = "nok"
try:
doc_ok, doc_meta = check_doc_engine()
result["doc_engine"] = _ok_nok(doc_ok)
if not doc_ok:
result.setdefault("_meta", {})["doc_engine"] = doc_meta
except Exception:
result["doc_engine"] = "nok"
try:
sto_ok, sto_meta = check_storage()
result["storage"] = _ok_nok(sto_ok)
if not sto_ok:
result.setdefault("_meta", {})["storage"] = sto_meta
except Exception:
result["storage"] = "nok"
all_ok = (result.get("db") == "ok") and (result.get("chat") == "ok")
result["status"] = "ok" if all_ok else "nok"
return result, all_ok
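# Illustrative return value when Redis is down but db/chat are healthy; only db
# and chat gate the overall flag, so all_ok is still True here:
#   ({"db": "ok", "chat": "ok", "redis": "nok",
#     "_meta": {"redis": {"elapsed": "3.1", "error": "..."}},
#     "doc_engine": "ok", "storage": "ok", "status": "ok"}, True)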

api/utils/health_utils.py Normal file

@@ -0,0 +1,200 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import requests
from timeit import default_timer as timer
from api import settings
from api.db.db_models import DB
from rag import settings as rag_settings
from rag.utils.redis_conn import REDIS_CONN
from rag.utils.storage_factory import STORAGE_IMPL
from rag.utils.es_conn import ESConnection
from rag.utils.infinity_conn import InfinityConnection
def _ok_nok(ok: bool) -> str:
return "ok" if ok else "nok"
def check_db() -> tuple[bool, dict]:
st = timer()
try:
# lightweight probe; works for MySQL/Postgres
DB.execute_sql("SELECT 1")
return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_redis() -> tuple[bool, dict]:
st = timer()
try:
ok = bool(REDIS_CONN.health())
return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_doc_engine() -> tuple[bool, dict]:
st = timer()
try:
meta = settings.docStoreConn.health()
# treat any successful call as ok
return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", **(meta or {})}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def check_storage() -> tuple[bool, dict]:
st = timer()
try:
STORAGE_IMPL.health()
return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
except Exception as e:
return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
def get_es_cluster_stats() -> dict:
doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
if doc_engine != 'elasticsearch':
raise Exception("Elasticsearch is not in use.")
try:
return {
"alive": True,
"message": ESConnection().get_cluster_stats()
}
except Exception as e:
return {
"alive": False,
"message": f"error: {str(e)}",
}
def get_infinity_status():
doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
if doc_engine != 'infinity':
raise Exception("Infinity is not in use.")
try:
return {
"alive": True,
"message": InfinityConnection().health()
}
except Exception as e:
return {
"alive": False,
"message": f"error: {str(e)}",
}
def get_mysql_status():
try:
cursor = DB.execute_sql("SHOW PROCESSLIST;")
res_rows = cursor.fetchall()
headers = ['id', 'user', 'host', 'db', 'command', 'time', 'state', 'info']
cursor.close()
return {
"alive": True,
"message": [dict(zip(headers, r)) for r in res_rows]
}
except Exception as e:
return {
"alive": False,
"message": f"error: {str(e)}",
}
def check_minio_alive():
start_time = timer()
try:
response = requests.get(f'http://{rag_settings.MINIO["host"]}/minio/health/live')
if response.status_code == 200:
return {'alive': True, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
else:
return {'alive': False, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
except Exception as e:
return {
"alive": False,
"message": f"error: {str(e)}",
}
def get_redis_info():
try:
return {
"alive": True,
"message": REDIS_CONN.info()
}
except Exception as e:
return {
"alive": False,
"message": f"error: {str(e)}",
}
def check_ragflow_server_alive():
start_time = timer()
try:
response = requests.get(f'http://{settings.HOST_IP}:{settings.HOST_PORT}/v1/system/ping')
if response.status_code == 200:
return {'alive': True, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
else:
return {'alive': False, "message": f"Confirm elapsed: {(timer() - start_time) * 1000.0:.1f} ms."}
except Exception as e:
return {
"alive": False,
"message": f"error: {str(e)}",
}
def run_health_checks() -> tuple[dict, bool]:
result: dict[str, str | dict] = {}
db_ok, db_meta = check_db()
result["db"] = _ok_nok(db_ok)
if not db_ok:
result.setdefault("_meta", {})["db"] = db_meta
try:
redis_ok, redis_meta = check_redis()
result["redis"] = _ok_nok(redis_ok)
if not redis_ok:
result.setdefault("_meta", {})["redis"] = redis_meta
except Exception:
result["redis"] = "nok"
try:
doc_ok, doc_meta = check_doc_engine()
result["doc_engine"] = _ok_nok(doc_ok)
if not doc_ok:
result.setdefault("_meta", {})["doc_engine"] = doc_meta
except Exception:
result["doc_engine"] = "nok"
try:
sto_ok, sto_meta = check_storage()
result["storage"] = _ok_nok(sto_ok)
if not sto_ok:
result.setdefault("_meta", {})["storage"] = sto_meta
except Exception:
result["storage"] = "nok"
all_ok = (result.get("db") == "ok") and (result.get("redis") == "ok") and (result.get("doc_engine") == "ok") and (result.get("storage") == "ok")
result["status"] = "ok" if all_ok else "nok"
return result, all_ok

api/utils/json.py Normal file

@@ -0,0 +1,78 @@
import datetime
import json
from enum import Enum, IntEnum
from api.utils.common import string_to_bytes, bytes_to_string
class BaseType:
def to_dict(self):
return dict([(k.lstrip("_"), v) for k, v in self.__dict__.items()])
def to_dict_with_type(self):
def _dict(obj):
module = None
if issubclass(obj.__class__, BaseType):
data = {}
for attr, v in obj.__dict__.items():
k = attr.lstrip("_")
data[k] = _dict(v)
module = obj.__module__
elif isinstance(obj, (list, tuple)):
data = []
                for vv in obj:
                    data.append(_dict(vv))
elif isinstance(obj, dict):
data = {}
for _k, vv in obj.items():
data[_k] = _dict(vv)
else:
data = obj
return {"type": obj.__class__.__name__,
"data": data, "module": module}
return _dict(self)
class CustomJSONEncoder(json.JSONEncoder):
def __init__(self, **kwargs):
self._with_type = kwargs.pop("with_type", False)
super().__init__(**kwargs)
def default(self, obj):
if isinstance(obj, datetime.datetime):
return obj.strftime('%Y-%m-%d %H:%M:%S')
elif isinstance(obj, datetime.date):
return obj.strftime('%Y-%m-%d')
elif isinstance(obj, datetime.timedelta):
return str(obj)
        elif isinstance(obj, Enum):  # covers IntEnum, which subclasses Enum
return obj.value
elif isinstance(obj, set):
return list(obj)
elif issubclass(type(obj), BaseType):
if not self._with_type:
return obj.to_dict()
else:
return obj.to_dict_with_type()
elif isinstance(obj, type):
return obj.__name__
else:
return json.JSONEncoder.default(self, obj)
def json_dumps(src, byte=False, indent=None, with_type=False):
dest = json.dumps(
src,
indent=indent,
cls=CustomJSONEncoder,
with_type=with_type)
if byte:
dest = string_to_bytes(dest)
return dest
def json_loads(src, object_hook=None, object_pairs_hook=None):
if isinstance(src, bytes):
src = bytes_to_string(src)
return json.loads(src, object_hook=object_hook,
object_pairs_hook=object_pairs_hook)
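
A round-trip sketch of how these helpers compose (the Point class is hypothetical):

from api.utils.json import BaseType, json_dumps, json_loads

class Point(BaseType):
    def __init__(self, x: int, y: int):
        self._x = x
        self._y = y

s = json_dumps(Point(1, 2), with_type=True)  # CustomJSONEncoder tags type/data/module
d = json_loads(s)                            # plain nested dict: {"type": "Point", ...}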

91
api/utils/log_utils.py Normal file
View File

@@ -0,0 +1,91 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import os.path
import logging
from logging.handlers import RotatingFileHandler
initialized_root_logger = False
def get_project_base_directory():
PROJECT_BASE = os.path.abspath(
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
os.pardir,
os.pardir,
)
)
return PROJECT_BASE
def init_root_logger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
global initialized_root_logger
if initialized_root_logger:
return
initialized_root_logger = True
logger = logging.getLogger()
logger.handlers.clear()
log_path = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"{logfile_basename}.log"))
os.makedirs(os.path.dirname(log_path), exist_ok=True)
formatter = logging.Formatter(log_format)
handler1 = RotatingFileHandler(log_path, maxBytes=10*1024*1024, backupCount=5)
handler1.setFormatter(formatter)
logger.addHandler(handler1)
handler2 = logging.StreamHandler()
handler2.setFormatter(formatter)
logger.addHandler(handler2)
logging.captureWarnings(True)
LOG_LEVELS = os.environ.get("LOG_LEVELS", "")
pkg_levels = {}
for pkg_name_level in LOG_LEVELS.split(","):
terms = pkg_name_level.split("=")
        if len(terms) != 2:
continue
pkg_name, pkg_level = terms[0], terms[1]
pkg_name = pkg_name.strip()
pkg_level = logging.getLevelName(pkg_level.strip().upper())
if not isinstance(pkg_level, int):
pkg_level = logging.INFO
pkg_levels[pkg_name] = logging.getLevelName(pkg_level)
for pkg_name in ['peewee', 'pdfminer']:
if pkg_name not in pkg_levels:
pkg_levels[pkg_name] = logging.getLevelName(logging.WARNING)
if 'root' not in pkg_levels:
pkg_levels['root'] = logging.getLevelName(logging.INFO)
for pkg_name, pkg_level in pkg_levels.items():
pkg_logger = logging.getLogger(pkg_name)
pkg_logger.setLevel(pkg_level)
msg = f"{logfile_basename} log path: {log_path}, log levels: {pkg_levels}"
logger.info(msg)
def log_exception(e, *args):
logging.exception(e)
for a in args:
if hasattr(a, "text"):
logging.error(a.text)
raise Exception(a.text)
else:
logging.error(str(a))
raise e
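
Typical wiring, assuming a service entry point whose log basename is ragflow_server (the name is an assumption):

import os

# Optional per-package level overrides, read by init_root_logger at startup.
os.environ["LOG_LEVELS"] = "peewee=DEBUG,root=WARNING"

from api.utils.log_utils import init_root_logger

init_root_logger("ragflow_server")  # writes to <project>/logs/ragflow_server.log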

636
api/utils/validation_utils.py Normal file
View File

@@ -0,0 +1,636 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import Counter
from typing import Annotated, Any, Literal
from uuid import UUID
from flask import Request
from pydantic import (
BaseModel,
ConfigDict,
Field,
StringConstraints,
ValidationError,
field_validator,
)
from pydantic_core import PydanticCustomError
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
from api.constants import DATASET_NAME_LIMIT
def validate_and_parse_json_request(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None, exclude_unset: bool = False) -> tuple[dict[str, Any] | None, str | None]:
"""
Validates and parses JSON requests through a multi-stage validation pipeline.
Implements a four-stage validation process:
1. Content-Type verification (must be application/json)
2. JSON syntax validation
3. Payload structure type checking
4. Pydantic model validation with error formatting
Args:
request (Request): Flask request object containing HTTP payload
validator (type[BaseModel]): Pydantic model class for data validation
extras (dict[str, Any] | None): Additional fields to merge into payload
before validation. These fields will be removed from the final output
exclude_unset (bool): Whether to exclude fields that have not been explicitly set
Returns:
tuple[Dict[str, Any] | None, str | None]:
- First element:
- Validated dictionary on success
- None on validation failure
- Second element:
- None on success
- Diagnostic error message on failure
Raises:
UnsupportedMediaType: When Content-Type header is not application/json
BadRequest: For structural JSON syntax errors
ValidationError: When payload violates Pydantic schema rules
Examples:
>>> validate_and_parse_json_request(valid_request, DatasetSchema)
({"name": "Dataset1", "format": "csv"}, None)
>>> validate_and_parse_json_request(xml_request, DatasetSchema)
(None, "Unsupported content type: Expected application/json, got text/xml")
>>> validate_and_parse_json_request(bad_json_request, DatasetSchema)
(None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")
Notes:
1. Validation Priority:
- Content-Type verification precedes JSON parsing
- Structural validation occurs before schema validation
2. Extra fields added via `extras` parameter are automatically removed
from the final output after validation
"""
try:
payload = request.get_json() or {}
except UnsupportedMediaType:
return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
except BadRequest:
return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"
if not isinstance(payload, dict):
return None, f"Invalid request payload: expected object, got {type(payload).__name__}"
try:
if extras is not None:
payload.update(extras)
validated_request = validator(**payload)
except ValidationError as e:
return None, format_validation_error_message(e)
parsed_payload = validated_request.model_dump(by_alias=True, exclude_unset=exclude_unset)
if extras is not None:
for key in list(parsed_payload.keys()):
if key in extras:
del parsed_payload[key]
return parsed_payload, None
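# Illustrative call site (the route handler and error helper are assumptions,
# not part of this module):
#
#   payload, err = validate_and_parse_json_request(request, CreateDatasetReq,
#                                                  extras={"tenant_id": tenant_id})
#   if err is not None:
#       return get_error_data_result(message=err)  # hypothetical error helper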
def validate_and_parse_request_args(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None) -> tuple[dict[str, Any] | None, str | None]:
"""
Validates and parses request arguments against a Pydantic model.
This function performs a complete request validation workflow:
1. Extracts query parameters from the request
2. Merges with optional extra values (if provided)
3. Validates against the specified Pydantic model
4. Cleans the output by removing extra values
5. Returns either parsed data or an error message
Args:
request (Request): Web framework request object containing query parameters
validator (type[BaseModel]): Pydantic model class for validation
extras (dict[str, Any] | None): Optional additional values to include in validation
but exclude from final output. Defaults to None.
Returns:
tuple[dict[str, Any] | None, str | None]:
- First element: Validated/parsed arguments as dict if successful, None otherwise
- Second element: Formatted error message if validation failed, None otherwise
Behavior:
- Query parameters are merged with extras before validation
- Extras are automatically removed from the final output
- All validation errors are formatted into a human-readable string
Raises:
TypeError: If validator is not a Pydantic BaseModel subclass
Examples:
Successful validation:
>>> validate_and_parse_request_args(request, MyValidator)
({'param1': 'value'}, None)
Failed validation:
>>> validate_and_parse_request_args(request, MyValidator)
(None, "param1: Field required")
With extras:
>>> validate_and_parse_request_args(request, MyValidator, extras={'internal_id': 123})
({'param1': 'value'}, None) # internal_id removed from output
Notes:
- Uses request.args.to_dict() for Flask-compatible parameter extraction
- Maintains immutability of original request arguments
- Preserves type conversion from Pydantic validation
"""
args = request.args.to_dict(flat=True)
try:
if extras is not None:
args.update(extras)
validated_args = validator(**args)
except ValidationError as e:
return None, format_validation_error_message(e)
parsed_args = validated_args.model_dump()
if extras is not None:
for key in list(parsed_args.keys()):
if key in extras:
del parsed_args[key]
return parsed_args, None
def format_validation_error_message(e: ValidationError) -> str:
"""
Formats validation errors into a standardized string format.
Processes pydantic ValidationError objects to create human-readable error messages
containing field locations, error descriptions, and input values.
Args:
e (ValidationError): The validation error instance containing error details
Returns:
str: Formatted error messages joined by newlines. Each line contains:
- Field path (dot-separated)
- Error message
- Truncated input value (max 128 chars)
Example:
>>> try:
... UserModel(name=123, email="invalid")
... except ValidationError as e:
... print(format_validation_error_message(e))
Field: <name> - Message: <Input should be a valid string> - Value: <123>
Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
"""
error_messages = []
for error in e.errors():
field = ".".join(map(str, error["loc"]))
msg = error["msg"]
input_val = error["input"]
input_str = str(input_val)
if len(input_str) > 128:
input_str = input_str[:125] + "..."
error_msg = f"Field: <{field}> - Message: <{msg}> - Value: <{input_str}>"
error_messages.append(error_msg)
return "\n".join(error_messages)
def normalize_str(v: Any) -> Any:
"""
Normalizes string values to a standard format while preserving non-string inputs.
Performs the following transformations when input is a string:
1. Trims leading/trailing whitespace (str.strip())
2. Converts to lowercase (str.lower())
Non-string inputs are returned unchanged, making this function safe for mixed-type
processing pipelines.
Args:
v (Any): Input value to normalize. Accepts any Python object.
Returns:
Any: Normalized string if input was string-type, original value otherwise.
Behavior Examples:
String Input: " Admin " → "admin"
Empty String: "   " → "" (empty string)
Non-String:
- 123 → 123
- None → None
- ["User"] → ["User"]
Typical Use Cases:
- Standardizing user input
- Preparing data for case-insensitive comparison
- Cleaning API parameters
- Normalizing configuration values
Edge Cases:
- Unicode whitespace is handled by str.strip()
- Locale-independent lowercasing (str.lower())
- Preserves falsy values (0, False, etc.)
Example:
>>> normalize_str(" ReadOnly ")
'readonly'
>>> normalize_str(42)
42
"""
if isinstance(v, str):
stripped = v.strip()
normalized = stripped.lower()
return normalized
return v
def validate_uuid1_hex(v: Any) -> str:
"""
Validates and converts input to a UUID version 1 hexadecimal string.
This function performs strict validation and normalization:
1. Accepts either UUID objects or UUID-formatted strings
2. Verifies the UUID is version 1 (time-based)
3. Returns the 32-character hexadecimal representation
Args:
v (Any): Input value to validate. Can be:
- UUID object (must be version 1)
- String in UUID format (e.g. "550e8400-e29b-41d4-a716-446655440000")
Returns:
str: 32-character lowercase hexadecimal string without hyphens
Example: "550e8400e29b41d4a716446655440000"
Raises:
PydanticCustomError: With code "invalid_UUID1_format" when:
- Input is not a UUID object or valid UUID string
- UUID version is not 1
- String doesn't match UUID format
Examples:
Valid cases:
>>> validate_uuid1_hex("550e8400-e29b-41d4-a716-446655440000")
'550e8400e29b41d4a716446655440000'
>>> validate_uuid1_hex(UUID('550e8400-e29b-41d4-a716-446655440000'))
'550e8400e29b41d4a716446655440000'
Invalid cases:
>>> validate_uuid1_hex("not-a-uuid") # raises PydanticCustomError
>>> validate_uuid1_hex(12345) # raises PydanticCustomError
>>> validate_uuid1_hex(UUID(int=0)) # not version 1, raises PydanticCustomError
Notes:
- Uses Python's built-in UUID parser for format validation
- Version check prevents accidental use of other UUID versions
- Hyphens in input strings are automatically removed in output
"""
try:
uuid_obj = UUID(v) if isinstance(v, str) else v
if uuid_obj.version != 1:
raise PydanticCustomError("invalid_UUID1_format", "Must be a UUID1 format")
return uuid_obj.hex
except (AttributeError, ValueError, TypeError):
raise PydanticCustomError("invalid_UUID1_format", "Invalid UUID1 format")
class Base(BaseModel):
model_config = ConfigDict(extra="forbid", strict=True)
class RaptorConfig(Base):
use_raptor: Annotated[bool, Field(default=False)]
prompt: Annotated[
str,
StringConstraints(strip_whitespace=True, min_length=1),
Field(
default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize."
),
]
max_token: Annotated[int, Field(default=256, ge=1, le=2048)]
threshold: Annotated[float, Field(default=0.1, ge=0.0, le=1.0)]
max_cluster: Annotated[int, Field(default=64, ge=1, le=1024)]
random_seed: Annotated[int, Field(default=0, ge=0)]
class GraphragConfig(Base):
use_graphrag: Annotated[bool, Field(default=False)]
entity_types: Annotated[list[str], Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])]
method: Annotated[Literal["light", "general"], Field(default="light")]
community: Annotated[bool, Field(default=False)]
resolution: Annotated[bool, Field(default=False)]
class ParserConfig(Base):
auto_keywords: Annotated[int, Field(default=0, ge=0, le=32)]
auto_questions: Annotated[int, Field(default=0, ge=0, le=10)]
chunk_token_num: Annotated[int, Field(default=512, ge=1, le=2048)]
delimiter: Annotated[str, Field(default=r"\n", min_length=1)]
graphrag: Annotated[GraphragConfig, Field(default_factory=lambda: GraphragConfig(use_graphrag=False))]
html4excel: Annotated[bool, Field(default=False)]
layout_recognize: Annotated[str, Field(default="DeepDOC")]
raptor: Annotated[RaptorConfig, Field(default_factory=lambda: RaptorConfig(use_raptor=False))]
tag_kb_ids: Annotated[list[str], Field(default_factory=list)]
topn_tags: Annotated[int, Field(default=1, ge=1, le=10)]
filename_embd_weight: Annotated[float | None, Field(default=0.1, ge=0.0, le=1.0)]
task_page_size: Annotated[int | None, Field(default=None, ge=1)]
pages: Annotated[list[list[int]] | None, Field(default=None)]
class CreateDatasetReq(Base):
name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
avatar: Annotated[str | None, Field(default=None, max_length=65535)]
description: Annotated[str | None, Field(default=None, max_length=65535)]
embedding_model: Annotated[str | None, Field(default=None, max_length=255, serialization_alias="embd_id")]
permission: Annotated[Literal["me", "team"], Field(default="me", min_length=1, max_length=16)]
chunk_method: Annotated[
Literal["naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"],
Field(default="naive", min_length=1, max_length=32, serialization_alias="parser_id"),
]
parser_config: Annotated[ParserConfig | None, Field(default=None)]
@field_validator("avatar", mode="after")
@classmethod
def validate_avatar_base64(cls, v: str | None) -> str | None:
"""
Validates Base64-encoded avatar string format and MIME type compliance.
Implements a three-stage validation workflow:
1. MIME prefix existence check
2. MIME type format validation
3. Supported type verification
Args:
v (str): Raw avatar field value
Returns:
str: Validated Base64 string
Raises:
PydanticCustomError: For structural errors in these cases:
- Missing MIME prefix header
- Invalid MIME prefix format
- Unsupported image MIME type
Example:
```python
# Valid case
CreateDatasetReq(avatar="data:image/png;base64,iVBORw0KGg...")
# Invalid cases
CreateDatasetReq(avatar="image/jpeg;base64,...") # Missing 'data:' prefix
CreateDatasetReq(avatar="data:video/mp4;base64,...") # Unsupported MIME type
```
"""
if v is None:
return v
if "," in v:
prefix, _ = v.split(",", 1)
if not prefix.startswith("data:"):
raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")
mime_type = prefix[5:].split(";")[0]
supported_mime_types = ["image/jpeg", "image/png"]
if mime_type not in supported_mime_types:
raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})
return v
else:
raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
@field_validator("embedding_model", mode="before")
@classmethod
def normalize_embedding_model(cls, v: Any) -> Any:
"""Normalize embedding model string by stripping whitespace"""
if isinstance(v, str):
return v.strip()
return v
@field_validator("embedding_model", mode="after")
@classmethod
def validate_embedding_model(cls, v: str | None) -> str | None:
"""
Validates embedding model identifier format compliance.
Validation pipeline:
1. Structural format verification
2. Component non-empty check
3. Value normalization
Args:
v (str): Raw model identifier
Returns:
str: Validated <model_name>@<provider> format
Raises:
PydanticCustomError: For these violations:
- Missing @ separator
- Empty model_name/provider
- Invalid component structure
Examples:
Valid: "text-embedding-3-large@openai"
Invalid: "invalid_model" (no @)
Invalid: "@openai" (empty model_name)
Invalid: "text-embedding-3-large@" (empty provider)
"""
if isinstance(v, str):
if "@" not in v:
raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")
components = v.split("@", 1)
if len(components) != 2 or not all(components):
raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")
model_name, provider = components
if not model_name.strip() or not provider.strip():
raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
return v
# @field_validator("permission", mode="before")
# @classmethod
# def normalize_permission(cls, v: Any) -> Any:
# return normalize_str(v)
@field_validator("parser_config", mode="before")
@classmethod
def normalize_empty_parser_config(cls, v: Any) -> Any:
"""
Normalizes empty parser configuration by converting empty dictionaries to None.
This validator ensures consistent handling of empty parser configurations across
the application by converting empty dicts to None values.
Args:
v (Any): Raw input value for the parser config field
Returns:
Any: Returns None if input is an empty dict, otherwise returns the original value
Example:
>>> normalize_empty_parser_config({})
None
>>> normalize_empty_parser_config({"key": "value"})
{"key": "value"}
"""
if v == {}:
return None
return v
@field_validator("parser_config", mode="after")
@classmethod
def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
"""
Validates serialized JSON length constraints for parser configuration.
Implements a two-stage validation workflow:
1. Null check - bypass validation for empty configurations
2. Model serialization - convert Pydantic model to JSON string
3. Size verification - enforce maximum allowed payload size
Args:
v (ParserConfig | None): Raw parser configuration object
Returns:
ParserConfig | None: Validated configuration object
Raises:
PydanticCustomError: When serialized JSON exceeds 65,535 characters
"""
if v is None:
return None
if (json_str := v.model_dump_json()) and len(json_str) > 65535:
raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
return v
class UpdateDatasetReq(CreateDatasetReq):
dataset_id: Annotated[str, Field(...)]
name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]
pagerank: Annotated[int, Field(default=0, ge=0, le=100)]
@field_validator("dataset_id", mode="before")
@classmethod
def validate_dataset_id(cls, v: Any) -> str:
return validate_uuid1_hex(v)
class DeleteReq(Base):
ids: Annotated[list[str] | None, Field(...)]
@field_validator("ids", mode="after")
@classmethod
def validate_ids(cls, v_list: list[str] | None) -> list[str] | None:
"""
Validates and normalizes a list of UUID strings with None handling.
This post-processing validator performs:
1. None input handling (pass-through)
2. UUID version 1 validation for each list item
3. Duplicate value detection
4. Returns normalized UUID hex strings or None
Args:
v_list (list[str] | None): Input list that has passed initial validation.
Either a list of UUID strings or None.
Returns:
list[str] | None:
- None if input was None
- List of normalized UUID hex strings otherwise:
* 32-character lowercase
* Valid UUID version 1
* Unique within list
Raises:
PydanticCustomError: With structured error details when:
- "invalid_UUID1_format": Any string fails UUIDv1 validation
- "duplicate_uuids": If duplicate IDs are detected
Validation Rules:
- None input returns None
- Empty list returns empty list
- All non-None items must be valid UUIDv1
- No duplicates permitted
- Original order preserved
Examples:
Valid cases:
>>> validate_ids(None)
None
>>> validate_ids([])
[]
>>> validate_ids(["550e8400-e29b-41d4-a716-446655440000"])
["550e8400e29b41d4a716446655440000"]
Invalid cases:
>>> validate_ids(["invalid"])
# raises PydanticCustomError(invalid_UUID1_format)
>>> validate_ids(["550e...", "550e..."])
# raises PydanticCustomError(duplicate_uuids)
Security Notes:
- Validates UUID version to prevent version spoofing
- Duplicate check prevents data injection
- None handling maintains pipeline integrity
"""
if v_list is None:
return None
        # validate_uuid1_hex already raises PydanticCustomError on failure;
        # there is no need to catch and re-raise it here.
        ids_list = [validate_uuid1_hex(v) for v in v_list]
duplicates = [item for item, count in Counter(ids_list).items() if count > 1]
if duplicates:
duplicates_str = ", ".join(duplicates)
raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})
return ids_list
class DeleteDatasetReq(DeleteReq): ...
class BaseListReq(BaseModel):
model_config = ConfigDict(extra="forbid")
id: Annotated[str | None, Field(default=None)]
name: Annotated[str | None, Field(default=None)]
page: Annotated[int, Field(default=1, ge=1)]
page_size: Annotated[int, Field(default=30, ge=1)]
orderby: Annotated[Literal["create_time", "update_time"], Field(default="create_time")]
desc: Annotated[bool, Field(default=True)]
@field_validator("id", mode="before")
@classmethod
def validate_id(cls, v: Any) -> str:
return validate_uuid1_hex(v)
class ListDatasetReq(BaseListReq): ...
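
A quick demonstration of the schema behavior described above (values are illustrative):

from pydantic import ValidationError

req = CreateDatasetReq(name="  my_kb  ", embedding_model="text-embedding-3-large@openai")
assert req.name == "my_kb"                       # whitespace stripped by StringConstraints
assert req.model_dump(by_alias=True)["embd_id"]  # serialization alias applied

try:
    CreateDatasetReq(name="kb", embedding_model="no-provider")  # missing @ separator
except ValidationError as e:
    print(format_validation_error_message(e))    # Field: <embedding_model> - Message: <...>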

201
api/utils/web_utils.py Normal file
View File

@@ -0,0 +1,201 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import ipaddress
import json
import re
import socket
from urllib.parse import urlparse
from api.apps import smtp_mail_server
from flask_mail import Message
from flask import render_template_string
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
CONTENT_TYPE_MAP = {
# Office
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"doc": "application/msword",
"pdf": "application/pdf",
"csv": "text/csv",
"xls": "application/vnd.ms-excel",
"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
# Text/code
"txt": "text/plain",
"py": "text/plain",
"js": "text/plain",
"java": "text/plain",
"c": "text/plain",
"cpp": "text/plain",
"h": "text/plain",
"php": "text/plain",
"go": "text/plain",
"ts": "text/plain",
"sh": "text/plain",
"cs": "text/plain",
"kt": "text/plain",
"sql": "text/plain",
# Web
"md": "text/markdown",
"markdown": "text/markdown",
"htm": "text/html",
"html": "text/html",
"json": "application/json",
# Image formats
"png": "image/png",
"jpg": "image/jpeg",
"jpeg": "image/jpeg",
"gif": "image/gif",
"bmp": "image/bmp",
"tiff": "image/tiff",
"tif": "image/tiff",
"webp": "image/webp",
"svg": "image/svg+xml",
"ico": "image/x-icon",
"avif": "image/avif",
"heic": "image/heic",
}
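# Typical lookup, falling back to a generic binary type for unknown extensions:
#   ext = filename.rsplit(".", 1)[-1].lower()
#   content_type = CONTENT_TYPE_MAP.get(ext, "application/octet-stream")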
def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: dict | None = None,
):
    # Avoid a shared mutable default argument; normalize to a fresh dict per call.
    return __get_pdf_from_html(source, timeout, install_driver, print_options or {})
def __send_devtools(driver, cmd, params=None):
    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": params or {}})
    response = driver.command_executor._request("POST", url, body)
    if not response:
        # Calling .get on an empty response would mask the real failure.
        raise Exception(f"DevTools command {cmd} returned no response")
    return response.get("value")
def __get_pdf_from_html(path: str, timeout: int, install_driver: bool, print_options: dict):
webdriver_options = Options()
webdriver_prefs = {}
webdriver_options.add_argument("--headless")
webdriver_options.add_argument("--disable-gpu")
webdriver_options.add_argument("--no-sandbox")
webdriver_options.add_argument("--disable-dev-shm-usage")
webdriver_options.experimental_options["prefs"] = webdriver_prefs
webdriver_prefs["profile.default_content_settings"] = {"images": 2}
if install_driver:
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=webdriver_options)
else:
driver = webdriver.Chrome(options=webdriver_options)
driver.get(path)
    try:
        # Waiting for <html> to go stale on a static page simply times out after
        # `timeout` seconds; the TimeoutException below is the expected path and
        # doubles as a crude "page finished loading" delay before printing.
        WebDriverWait(driver, timeout).until(staleness_of(driver.find_element(by=By.TAG_NAME, value="html")))
    except TimeoutException:
calculated_print_options = {
"landscape": False,
"displayHeaderFooter": False,
"printBackground": True,
"preferCSSPageSize": True,
}
calculated_print_options.update(print_options)
result = __send_devtools(driver, "Page.printToPDF", calculated_print_options)
driver.quit()
return base64.b64decode(result["data"])
def is_private_ip(ip: str) -> bool:
try:
ip_obj = ipaddress.ip_address(ip)
return ip_obj.is_private
except ValueError:
return False
def is_valid_url(url: str) -> bool:
if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url):
return False
parsed_url = urlparse(url)
hostname = parsed_url.hostname
if not hostname:
return False
try:
ip = socket.gethostbyname(hostname)
if is_private_ip(ip):
return False
except socket.gaierror:
return False
return True
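# Expected behavior (assuming public DNS resolution for example.com):
#   is_valid_url("https://example.com/page")  -> True
#   is_valid_url("http://10.0.0.5/internal")  -> False  (private IP, SSRF guard)
#   is_valid_url("not a url")                 -> False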
def safe_json_parse(data: str | dict) -> dict:
if isinstance(data, dict):
return data
try:
return json.loads(data) if data else {}
except (json.JSONDecodeError, TypeError):
return {}
def get_float(req: dict, key: str, default: float | int = 10.0) -> float:
try:
parsed = float(req.get(key, default))
return parsed if parsed > 0 else default
except (TypeError, ValueError):
return default
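# Examples:
#   get_float({"top_k": "5"}, "top_k")   -> 5.0
#   get_float({"top_k": "-1"}, "top_k")  -> 10.0  (non-positive falls back to default)
#   get_float({}, "temperature", 0.7)    -> 0.7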
INVITE_EMAIL_TMPL = """
<p>Hi {{email}},</p>
<p>{{inviter}} has invited you to join their team (ID: {{tenant_id}}).</p>
<p>Click the link below to complete your registration:<br>
<a href="{{invite_url}}">{{invite_url}}</a></p>
<p>If you did not request this, please ignore this email.</p>
"""
def send_invite_email(to_email, invite_url, tenant_id, inviter):
from api.apps import app
with app.app_context():
msg = Message(subject="RAGFlow Invitation",
recipients=[to_email])
msg.html = render_template_string(
INVITE_EMAIL_TMPL,
email=to_email,
invite_url=invite_url,
tenant_id=tenant_id,
inviter=inviter,
)
smtp_mail_server.send(msg)
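# Hypothetical call site (addresses, URL, and IDs are illustrative):
#   send_invite_email("alice@example.com", invite_url, tenant_id, inviter="bob")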