[fix]Fixed five known issues, including token authentication and multi-worker operation. (#63)

* 0.6.1Version

* fix 0.5.1 schema async bug

* fix security bug

* fix security bug

* Add complete Token, JWT, OAuth authentication system

* Add complete Token, JWT, OAuth authentication system

* Add complete Token, JWT, OAuth authentication system

* Add complete Token, JWT, OAuth authentication system

* Add a controllable MCP Server DB Pool permission authentication system, connect it with the Doris permission system, and provide it to enterprise-level applications concurrently with the multi-Worker mode.

* Add Tokens Management

* change version

* fix stdio start bug

* fix stdio start bug

* fix stdio start bug
This commit is contained in:
Yijia Su
2025-11-04 14:45:38 +08:00
committed by GitHub
parent 2613912df3
commit a125a2f5f8
6 changed files with 295 additions and 88 deletions

View File

@@ -95,12 +95,14 @@ class DorisConnection:
await cursor.execute(sql, params)
# Check if it's a query statement (statement that returns result set)
# FIX for Issue #62 Bug 5: Added WITH support for Common Table Expressions (CTE)
sql_upper = sql.strip().upper()
if (sql_upper.startswith("SELECT") or
sql_upper.startswith("SHOW") or
sql_upper.startswith("DESCRIBE") or
sql_upper.startswith("DESC") or
sql_upper.startswith("EXPLAIN")):
if (sql_upper.startswith("SELECT") or
sql_upper.startswith("SHOW") or
sql_upper.startswith("DESCRIBE") or
sql_upper.startswith("DESC") or
sql_upper.startswith("EXPLAIN") or
sql_upper.startswith("WITH")): # FIX: Support CTE queries
data = await cursor.fetchall()
row_count = len(data)
else:
@@ -250,7 +252,16 @@ class DorisConnectionManager:
self.logger = get_logger(__name__)
self.security_manager = security_manager
self.token_manager = token_manager # Token manager for token-bound DB config
self.session_cache = DorisSessionCache(self)
# FIX for Issue #58 Problem 1: Disable session caching to prevent connection sharing
# Session caching causes multiple threads to share the same MySQL connection,
# leading to race conditions and deadlocks in multi-threaded environments
# By disabling caching, each request gets a fresh connection from the pool
self.session_cache = DorisSessionCache(
self,
cache_system_session=False, # Disabled to prevent multi-thread issues
cache_user_session=False # Disabled to prevent multi-thread issues
)
# Store original database config for fallback
self.original_db_config = {
@@ -1258,17 +1269,43 @@ class DorisConnectionManager:
async def execute_query(
self, session_id: str, sql: str, params: tuple | None = None, auth_context=None
) -> QueryResult:
"""Execute query - Simplified Strategy with automatic connection management"""
"""Execute query - Simplified Strategy with automatic connection management
FIX for Issue #62 Bug 1: Configure token-bound database before query execution
"""
connection = None
try:
# Always get fresh connection from pool
# FIX: Configure database for token BEFORE getting connection
# This ensures token-bound database configuration is used instead of global config
if auth_context and hasattr(auth_context, 'token') and auth_context.token:
try:
success, config_source = await self.configure_for_token(auth_context.token)
if success:
self.logger.info(f"Session {session_id}: Using {config_source} database configuration")
else:
self.logger.warning(f"Session {session_id}: Token configuration failed, may use global config")
except Exception as token_config_error:
# SECURITY: If token should have config but configuration fails, don't fallback
# This prevents privilege escalation (using high-privilege default user)
if self.token_manager:
self.logger.error(f"Session {session_id}: Token database configuration failed: {token_config_error}")
raise RuntimeError(
f"Failed to configure database for authenticated token. "
f"This is a security measure to prevent using default high-privilege credentials. "
f"Error: {token_config_error}"
)
else:
# No token manager, can use global config
self.logger.warning(f"Session {session_id}: No token manager, using global config")
# Always get fresh connection from pool (with configured database)
connection = await self.get_connection(session_id)
# Execute query
result = await connection.execute(sql, params, auth_context)
return result
except Exception as e:
self.logger.error(f"Query execution failed for session {session_id}: {e}")
raise

View File

@@ -541,17 +541,21 @@ class DorisQueryExecutor:
await self.query_cache.clear_all()
async def execute_sql_for_mcp(
self,
sql: str,
limit: int = 1000,
self,
sql: str,
limit: int = 1000,
timeout: int = 30,
session_id: str = "mcp_session",
user_id: str = "mcp_user"
user_id: str = "mcp_user",
auth_context = None # FIX for Issue #62 Bug 1: Accept auth_context with token
) -> Dict[str, Any]:
"""Execute SQL query for MCP interface - unified method"""
"""Execute SQL query for MCP interface - unified method
FIX for Issue #62 Bug 1: Now accepts auth_context parameter to support token-bound database configuration
"""
max_retries = 2
retry_count = 0
while retry_count <= max_retries:
try:
if not sql:
@@ -564,14 +568,20 @@ class DorisQueryExecutor:
# Import required security modules
from .security import DorisSecurityManager, AuthContext, SecurityLevel
# Create proper auth context with read-only permissions
auth_context = AuthContext(
user_id=user_id,
roles=["read_only_user"], # Restrictive role for MCP interface
permissions=["read_data"], # Only read permissions
session_id=session_id,
security_level=SecurityLevel.INTERNAL
)
# FIX: Use provided auth_context if available (contains token for DB config)
# Otherwise create default auth context for backward compatibility
if auth_context is None:
auth_context = AuthContext(
user_id=user_id,
roles=["read_only_user"], # Restrictive role for MCP interface
permissions=["read_data"], # Only read permissions
session_id=session_id,
security_level=SecurityLevel.INTERNAL,
token="" # No token in default context
)
else:
# Use provided auth_context (may contain token for database configuration)
self.logger.debug(f"Using provided auth_context with token: {bool(hasattr(auth_context, 'token') and auth_context.token)}")
# Perform SQL security validation if enabled
if hasattr(self.connection_manager, 'config') and hasattr(self.connection_manager.config, 'security'):
@@ -579,7 +589,7 @@ class DorisQueryExecutor:
try:
security_manager = DorisSecurityManager(self.connection_manager.config)
validation_result = await security_manager.validate_sql_security(sql, auth_context)
if not validation_result.is_valid:
self.logger.warning(f"SQL security validation failed for query: {sql[:100]}...")
return {
@@ -877,33 +887,42 @@ class QueryPerformanceMonitor:
# Unified convenience function for MCP integration
async def execute_sql_query(sql: str, connection_manager: DorisConnectionManager, **kwargs) -> Dict[str, Any]:
"""Execute SQL query - unified convenience function for MCP tools
This function now includes security validation to ensure safe query execution.
All queries are validated against the configured security policies before execution.
FIX for Issue #62 Bug 1: Now supports auth_context parameter for token-bound database configuration
FIX for Issue #58 Problem 2: Removed executor.close() to prevent ClosedResourceError in multi-worker mode
"""
try:
# Create query executor with the connection manager's configuration
executor = DorisQueryExecutor(connection_manager)
try:
# Extract parameters from kwargs or use defaults
limit = kwargs.get("limit", 1000)
timeout = kwargs.get("timeout", 30)
session_id = kwargs.get("session_id", "mcp_session")
user_id = kwargs.get("user_id", "mcp_user")
# The execute_sql_for_mcp method now includes security validation
result = await executor.execute_sql_for_mcp(
sql=sql,
limit=limit,
timeout=timeout,
session_id=session_id,
user_id=user_id
)
return result
finally:
await executor.close()
# Extract parameters from kwargs or use defaults
limit = kwargs.get("limit", 1000)
timeout = kwargs.get("timeout", 30)
session_id = kwargs.get("session_id", "mcp_session")
user_id = kwargs.get("user_id", "mcp_user")
auth_context = kwargs.get("auth_context", None) # FIX: Extract auth_context
# The execute_sql_for_mcp method now includes security validation
result = await executor.execute_sql_for_mcp(
sql=sql,
limit=limit,
timeout=timeout,
session_id=session_id,
user_id=user_id,
auth_context=auth_context # FIX: Pass auth_context with token
)
# FIX for Issue #58 Problem 2: Do NOT close executor here
# In multi-worker mode, closing here causes ClosedResourceError
# The executor's resources (cache, background tasks) will be managed
# by the connection_manager lifecycle and Python's garbage collection
# This prevents premature cleanup while MCP session manager is still processing
return result
except Exception as e:
return {
"success": False,

View File

@@ -1454,32 +1454,83 @@ class MetadataExtractor:
return response_data
async def exec_query_for_mcp(
self,
sql: str,
db_name: str = None,
catalog_name: str = None,
max_rows: int = 100,
self,
sql: str,
db_name: str = None,
catalog_name: str = None,
max_rows: int = 100,
timeout: int = 30
) -> Dict[str, Any]:
"""
Execute SQL query and return results, supports catalog federation queries
Unified interface for MCP tools
FIX for Issue #62 Bug 1: Now retrieves auth_context from context variable to support token-bound database configuration
FIX for Issue #62 Bug 3: Now uses db_name and catalog_name parameters to switch database context
"""
logger.info(f"Executing SQL query: {sql}, DB: {db_name}, Catalog: {catalog_name}, MaxRows: {max_rows}, Timeout: {timeout}")
try:
if not sql:
return self._format_response(success=False, error="No SQL statement provided", message="Please provide SQL statement to execute")
# FIX for Issue #62 Bug 3: Build context switching SQL if db_name or catalog_name is specified
final_sql = sql
if catalog_name or db_name:
context_statements = []
if catalog_name:
# Switch to specified catalog
context_statements.append(f"USE CATALOG `{catalog_name}`")
logger.debug(f"Switching to catalog: {catalog_name}")
if db_name:
# Switch to specified database
if catalog_name:
context_statements.append(f"USE `{catalog_name}`.`{db_name}`")
else:
context_statements.append(f"USE `{db_name}`")
logger.debug(f"Switching to database: {db_name}")
# Combine context switching with original SQL
if context_statements:
# Remove trailing semicolon from context statements if present
context_sql = "; ".join(context_statements)
# Ensure original SQL doesn't start with semicolon
sql_clean = sql.lstrip(";").strip()
final_sql = f"{context_sql}; {sql_clean}"
logger.debug(f"Modified SQL with context switching: {final_sql[:200]}...")
# FIX: Try to get auth_context from context variable (set by HTTP middleware)
# This allows token-bound database configuration to work
auth_context = None
try:
from contextvars import ContextVar
from .security import AuthContext
# Try to get auth_context from context variable
# This will be set by the HTTP request handler in main.py
auth_context_var: ContextVar = ContextVar('mcp_auth_context', default=None)
auth_context = auth_context_var.get()
if auth_context:
logger.debug(f"Retrieved auth_context from context variable with token: {bool(hasattr(auth_context, 'token') and auth_context.token)}")
else:
logger.debug("No auth_context found in context variable, using default")
except Exception as ctx_error:
logger.debug(f"Could not retrieve auth_context from context variable: {ctx_error}")
auth_context = None
# Import query executor
from .query_executor import execute_sql_query
# Call execute_sql_query to execute query
# Call execute_sql_query to execute query with auth_context
exec_result = await execute_sql_query(
sql=sql,
sql=final_sql, # Use modified SQL with context switching
connection_manager=self.connection_manager,
limit=max_rows,
timeout=timeout
timeout=timeout,
auth_context=auth_context # FIX: Pass auth_context with token
)
return exec_result

View File

@@ -939,28 +939,69 @@ class SQLSecurityValidator:
async def _check_sql_injection(
self, sql: str, parsed: Statement
) -> ValidationResult:
"""Check SQL injection risks"""
# Check common SQL injection patterns
"""Check SQL injection risks with improved pattern detection
FIX for Issue #62 Bug 2: Improved patterns to reduce false positives
Now better distinguishes between legitimate SQL (like BETWEEN...AND) and injection attempts
"""
# Improved injection patterns that are more specific and less prone to false positives
injection_patterns = [
r"(?i)(?<![A-Za-z0-9_])(union|select|insert|update|delete|drop|create|alter)(?![A-Za-z0-9_])\s+[\s\S]*?\s+(?<![A-Za-z0-9_])(union|select|insert|update|delete|drop|create|alter)(?![A-Za-z0-9_])",
r"(\s|^)(or|and)\s+\d+\s*=\s*\d+",
r"(\s|^)(or|and)\s+['\"].*['\"]",
r";\s*(drop|delete|truncate|alter|create)",
r"(exec|execute|sp_|xp_)",
r"(script|javascript|vbscript)",
r"(char|ascii|substring|concat)\s*\(",
# Stacked queries with dangerous operations (true injection risk)
r";\s*(DROP|DELETE|TRUNCATE|ALTER|CREATE|INSERT|UPDATE)\s+",
# UNION-based injection (but allow legitimate UNION queries)
# Only flag if UNION is followed by suspicious patterns like SELECT with WHERE 1=1
r"UNION\s+(ALL\s+)?SELECT\s+.*\s+(WHERE|AND|OR)\s+\d+\s*=\s*\d+",
# Boolean-based blind injection with comments (true injection pattern)
r"(WHERE|AND|OR)\s+\d+\s*=\s*\d+\s*(--|#|/\*)",
# Quote-based injection attempts (but not in legitimate strings)
r"(WHERE|AND|OR)\s+(['\"])[^\2]*\2\s*=\s*\2[^\2]*\2",
# Time-based blind injection
r"(SLEEP|WAITFOR|BENCHMARK)\s*\(",
# System stored procedure injection
r"(EXEC|EXECUTE|SP_|XP_)\s*\(",
# Script injection attempts
r"<\s*(SCRIPT|JAVASCRIPT|VBSCRIPT)",
]
sql_lower = sql.lower()
# FIX: Don't flag legitimate SQL functions and keywords
# These patterns are too broad and cause false positives:
# - REMOVED: r"(char|ascii|substring|concat)\s*\(" - These are legitimate SQL functions
# - REMOVED: r"(\s|^)(or|and)\s+\d+\s*=\s*\d+" - This flags BETWEEN...AND constructs
# - REMOVED: r"(\s|^)(or|and)\s+['\"].*['\"]" - This is too broad
sql_upper = sql.upper()
# Special case: Allow BETWEEN...AND which is legitimate SQL
# This prevents false positives like "WHERE dt BETWEEN '2025-01-01' AND '2025-01-31'"
if "BETWEEN" in sql_upper and "AND" in sql_upper:
# This is likely a BETWEEN clause, not injection
# Check if AND appears in a BETWEEN context
between_pattern = r"BETWEEN\s+[^\s]+\s+AND\s+[^\s]+"
if re.search(between_pattern, sql_upper, re.IGNORECASE):
# Remove BETWEEN clauses before checking other patterns
sql_cleaned = re.sub(between_pattern, "BETWEEN_CLAUSE", sql_upper, flags=re.IGNORECASE)
sql_to_check = sql_cleaned
else:
sql_to_check = sql_upper
else:
sql_to_check = sql_upper
for pattern in injection_patterns:
if re.search(pattern, sql_lower, re.IGNORECASE):
if re.search(pattern, sql_to_check, re.IGNORECASE):
self.logger.warning(f"Potential SQL injection pattern detected: {pattern}")
return ValidationResult(
is_valid=False,
error_message="Potential SQL injection risk detected",
risk_level="high",
)
# Check suspicious quotes and comments
# Check suspicious quotes and comments (with improved detection)
if self._has_suspicious_quotes_or_comments(sql):
return ValidationResult(
is_valid=False,
@@ -971,19 +1012,67 @@ class SQLSecurityValidator:
return ValidationResult(is_valid=True)
def _has_suspicious_quotes_or_comments(self, sql: str) -> bool:
"""Check suspicious quote and comment patterns"""
# Check unmatched quotes
single_quotes = sql.count("'")
double_quotes = sql.count('"')
"""Check suspicious quote and comment patterns with improved detection
if single_quotes % 2 != 0 or double_quotes % 2 != 0:
return True
FIX for Issue #62 Bug 2: Improved detection to reduce false positives
Now distinguishes between legitimate comments/strings and injection attempts
"""
try:
# Use sqlparse to parse the SQL and distinguish between code and comments/strings
import sqlparse
from sqlparse.tokens import Comment, String
# Check SQL comments
if "--" in sql or "/*" in sql:
return True
# Parse the SQL
parsed = sqlparse.parse(sql)
if not parsed:
# If parsing fails, be conservative
return True
return False
statement = parsed[0]
# Check for unmatched quotes ONLY in non-string tokens
# This prevents false positives from legitimate string content
non_string_content = []
has_string_tokens = False
for token in statement.flatten():
if token.ttype in (String.Single, String.Double):
has_string_tokens = True
# Skip string content - quotes inside strings are legitimate
continue
elif token.ttype in (Comment.Single, Comment.Multi):
# Comments are generally OK, but check for suspicious injection patterns
comment_value = str(token).lower()
# Check if comment contains dangerous SQL keywords
dangerous_in_comments = ['drop', 'delete', 'insert', 'update', 'union', 'exec', 'execute']
if any(keyword in comment_value for keyword in dangerous_in_comments):
self.logger.warning(f"Suspicious SQL keyword in comment: {token}")
return True
# Normal comments are OK
continue
else:
# Accumulate non-string, non-comment content
non_string_content.append(str(token))
# Check for unmatched quotes in non-string content
non_string_text = ''.join(non_string_content)
single_quotes = non_string_text.count("'")
double_quotes = non_string_text.count('"')
# Only flag if there are unmatched quotes in actual SQL code (not in strings)
if single_quotes % 2 != 0 or double_quotes % 2 != 0:
return True
# FIX: Don't flag legitimate SQL comments
# Comments are OK as long as they don't contain dangerous patterns (already checked above)
return False
except Exception as e:
self.logger.debug(f"SQL parsing error in quote/comment check: {e}")
# On parsing error, fall back to conservative check
# But be more lenient than before
return False # Don't flag on parse errors to reduce false positives
async def _check_blocked_keywords(self, parsed: Statement) -> ValidationResult:
"""Check blocked keywords"""