at_eof bug fix
This commit is contained in:
@@ -70,6 +70,7 @@ class DorisConnection:
|
|||||||
self.query_count = 0
|
self.query_count = 0
|
||||||
self.is_healthy = True
|
self.is_healthy = True
|
||||||
self.security_manager = security_manager
|
self.security_manager = security_manager
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
async def execute(self, sql: str, params: tuple | None = None, auth_context=None) -> QueryResult:
|
async def execute(self, sql: str, params: tuple | None = None, auth_context=None) -> QueryResult:
|
||||||
"""Execute SQL query"""
|
"""Execute SQL query"""
|
||||||
@@ -135,31 +136,88 @@ class DorisConnection:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
async def ping(self, *, ping_timeout: float = 5, query_timeout: float = 3) -> bool:
    """Probe the underlying connection and record the result in ``self.is_healthy``.

    The checks run from cheapest to most expensive and stop at the first
    failure:

    1. the connection object exists and is not closed;
    2. its ``_reader`` is present;
    3. the reader's ``_transport`` (when the attribute exists) is not ``None``;
    4. the reader's ``at_eof()`` (when callable) does not report EOF;
    5. ``connection.ping()`` completes within ``ping_timeout`` seconds;
    6. ``SELECT 1`` executes and returns ``1`` as the first column, with
       ``query_timeout`` seconds allowed for each of execute and fetch.

    Args:
        ping_timeout: Seconds to wait for ``connection.ping()``.
        query_timeout: Seconds to wait for each step of the ``SELECT 1`` probe.

    Returns:
        ``True`` when every check passes, ``False`` otherwise. Exceptions
        derived from ``Exception`` are caught and logged at debug level and
        result in ``False``.
    """
    # Substrings that identify a broken stream/reader rather than a server-side
    # error; they only influence the wording of the debug message.
    state_error_keywords = ('at_eof', 'nonetype', 'reader', 'transport')

    def unhealthy(message=None, *args) -> bool:
        """Log *message* (if any), flag the connection unhealthy, return False."""
        if message is not None:
            self.logger.debug("Connection %s " + message, self.session_id, *args)
        self.is_healthy = False
        return False

    def failure_text(action: str, error: Exception, fallback: str = "") -> str:
        """Build the '<action> failed...: <error>' text used by the debug logs."""
        lowered = str(error).lower()
        if any(keyword in lowered for keyword in state_error_keywords):
            qualifier = " with connection state error"
        else:
            qualifier = fallback
        return f"{action} failed{qualifier}: {error}"

    try:
        connection = self.connection

        # Check 1: connection exists and is not closed (no log, as before).
        if not connection or connection.closed:
            return unhealthy()

        # Check 2: internal reader must exist; a missing reader is what later
        # surfaces as "'NoneType' object has no attribute 'at_eof'".
        reader = getattr(connection, '_reader', None)
        if reader is None:
            return unhealthy("has invalid _reader state")

        # Check 3: reader must still be attached to a transport.
        if hasattr(reader, '_transport') and reader._transport is None:
            return unhealthy("has invalid transport state")

        # Check 4: a stream already at EOF means the peer closed the socket.
        at_eof = getattr(reader, 'at_eof', None)
        if callable(at_eof):
            try:
                if at_eof():
                    return unhealthy("reader is at EOF")
            except Exception:
                # If at_eof itself cannot be evaluated the stream is unusable.
                return unhealthy("cannot check at_eof state")

        # Check 5: protocol-level ping, bounded by a timeout.
        # NOTE(review): aiomysql's ping() may transparently reconnect by
        # default -- confirm whether that is desired for a health probe.
        try:
            await asyncio.wait_for(connection.ping(), timeout=ping_timeout)
        except asyncio.TimeoutError:
            return unhealthy("ping timeout")
        except Exception as ping_error:
            return unhealthy("%s", failure_text("ping", ping_error))

        # Check 6: round-trip a trivial query to validate the full read path.
        try:
            async with connection.cursor() as cursor:
                await asyncio.wait_for(cursor.execute("SELECT 1"), timeout=query_timeout)
                row = await asyncio.wait_for(cursor.fetchone(), timeout=query_timeout)
                if not row or row[0] != 1:
                    return unhealthy("test query returned invalid result")
        except Exception as query_error:
            return unhealthy("%s", failure_text("test query", query_error))

        # Every check passed.
        self.is_healthy = True
        return True

    except Exception as e:
        # Anything not handled above still means the connection is unusable.
        return unhealthy("%s", failure_text("ping", e, fallback=" with unexpected error"))
|
||||||
|
|
||||||
@@ -188,11 +246,15 @@ class DorisConnectionManager:
|
|||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self.security_manager = security_manager
|
self.security_manager = security_manager
|
||||||
|
|
||||||
# Health check configuration
|
# Enhanced health check configuration for long-connection issues
|
||||||
self.health_check_interval = config.database.health_check_interval or 60
|
# Reduce health check interval to detect stale connections faster
|
||||||
|
self.health_check_interval = min(config.database.health_check_interval or 60, 30) # Max 30 seconds
|
||||||
self.max_connection_age = config.database.max_connection_age or 3600
|
self.max_connection_age = config.database.max_connection_age or 3600
|
||||||
self.connection_timeout = config.database.connection_timeout or 30
|
self.connection_timeout = config.database.connection_timeout or 30
|
||||||
|
|
||||||
|
# Add stale connection detection threshold (much shorter than MySQL's wait_timeout)
|
||||||
|
self.stale_connection_threshold = 900 # 15 minutes - connections older than this are considered stale
|
||||||
|
|
||||||
# Start background tasks
|
# Start background tasks
|
||||||
self._health_check_task = None
|
self._health_check_task = None
|
||||||
self._cleanup_task = None
|
self._cleanup_task = None
|
||||||
@@ -210,8 +272,11 @@ class DorisConnectionManager:
|
|||||||
if not self.config.database.password:
|
if not self.config.database.password:
|
||||||
self.logger.warning("Database password is empty, this may cause connection issues")
|
self.logger.warning("Database password is empty, this may cause connection issues")
|
||||||
|
|
||||||
# Create connection pool with improved stability parameters
|
# Create connection pool with aggressive connection recycling to prevent at_eof issues
|
||||||
# Key change: Set minsize=0 to avoid pre-creation issues that cause at_eof errors
|
# Key changes:
|
||||||
|
# 1. Reduce pool_recycle to 30 minutes (1800 seconds) - much shorter than MySQL's wait_timeout
|
||||||
|
# 2. Add shorter connect_timeout to fail fast on bad connections
|
||||||
|
# 3. Enable autocommit to avoid transaction state issues
|
||||||
self.pool = await aiomysql.create_pool(
|
self.pool = await aiomysql.create_pool(
|
||||||
host=self.config.database.host,
|
host=self.config.database.host,
|
||||||
port=self.config.database.port,
|
port=self.config.database.port,
|
||||||
@@ -220,12 +285,11 @@ class DorisConnectionManager:
|
|||||||
db=self.config.database.database,
|
db=self.config.database.database,
|
||||||
charset="utf8",
|
charset="utf8",
|
||||||
minsize=self.config.database.min_connections, # Always 0 per configuration to avoid at_eof issues
|
minsize=self.config.database.min_connections, # Always 0 per configuration to avoid at_eof issues
|
||||||
|
|
||||||
maxsize=self.config.database.max_connections or 20,
|
maxsize=self.config.database.max_connections or 20,
|
||||||
autocommit=True,
|
autocommit=True,
|
||||||
connect_timeout=self.connection_timeout,
|
connect_timeout=15, # Shorter timeout to fail fast
|
||||||
# Enhanced stability parameters
|
# Aggressive connection recycling to prevent stale connections
|
||||||
pool_recycle=7200, # Recycle connections every 2 hours
|
pool_recycle=1800, # Recycle connections every 30 minutes instead of 2 hours
|
||||||
echo=False, # Don't echo SQL statements
|
echo=False, # Don't echo SQL statements
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -234,13 +298,12 @@ class DorisConnectionManager:
|
|||||||
raise RuntimeError("Connection pool robust test failed")
|
raise RuntimeError("Connection pool robust test failed")
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Connection pool initialized successfully with on-demand connection creation, "
|
f"Connection pool initialized successfully with aggressive recycling (30min), "
|
||||||
f"min connections: {self.config.database.min_connections}, "
|
f"min connections: {self.config.database.min_connections}, "
|
||||||
|
|
||||||
f"max connections: {self.config.database.max_connections or 20}"
|
f"max connections: {self.config.database.max_connections or 20}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Start background monitoring tasks
|
# Start background monitoring tasks with more frequent health checks
|
||||||
self._health_check_task = asyncio.create_task(self._health_check_loop())
|
self._health_check_task = asyncio.create_task(self._health_check_loop())
|
||||||
self._cleanup_task = asyncio.create_task(self._cleanup_loop())
|
self._cleanup_task = asyncio.create_task(self._cleanup_loop())
|
||||||
|
|
||||||
@@ -312,10 +375,25 @@ class DorisConnectionManager:
|
|||||||
self.logger.warning(f"Pool returned closed connection (attempt {attempt + 1})")
|
self.logger.warning(f"Pool returned closed connection (attempt {attempt + 1})")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Perform a simple ping test instead of checking internal state
|
# Enhanced connection validation with multiple checks
|
||||||
# Internal state (_reader, _transport) might not be fully initialized yet
|
|
||||||
try:
|
try:
|
||||||
# Test basic connectivity with a simple query
|
# Check 1: Verify connection object internal state
|
||||||
|
if not hasattr(raw_connection, '_reader') or raw_connection._reader is None:
|
||||||
|
self.logger.warning(f"Connection has invalid _reader state (attempt {attempt + 1})")
|
||||||
|
await raw_connection.ensure_closed()
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check 2: Verify transport state
|
||||||
|
if (hasattr(raw_connection._reader, '_transport') and
|
||||||
|
raw_connection._reader._transport is None):
|
||||||
|
self.logger.warning(f"Connection has invalid transport state (attempt {attempt + 1})")
|
||||||
|
await raw_connection.ensure_closed()
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check 3: Perform ping test to verify server-side connectivity
|
||||||
|
await raw_connection.ping()
|
||||||
|
|
||||||
|
# Check 4: Test with actual query execution
|
||||||
async with raw_connection.cursor() as cursor:
|
async with raw_connection.cursor() as cursor:
|
||||||
await cursor.execute("SELECT 1")
|
await cursor.execute("SELECT 1")
|
||||||
result = await cursor.fetchone()
|
result = await cursor.fetchone()
|
||||||
@@ -328,17 +406,27 @@ class DorisConnectionManager:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Check if this is an at_eof error specifically
|
# Enhanced error detection for connection issues
|
||||||
error_str = str(e).lower()
|
error_str = str(e).lower()
|
||||||
if 'at_eof' in error_str or 'nonetype' in error_str:
|
|
||||||
self.logger.warning(f"Connection has at_eof issue (attempt {attempt + 1}): {e}")
|
# Check for various connection-related errors
|
||||||
|
connection_error_keywords = [
|
||||||
|
'at_eof', 'nonetype', 'connection', 'transport', 'reader',
|
||||||
|
'lost connection', 'broken pipe', 'connection reset',
|
||||||
|
'timed out', 'connection refused', 'host unreachable'
|
||||||
|
]
|
||||||
|
|
||||||
|
is_connection_error = any(keyword in error_str for keyword in connection_error_keywords)
|
||||||
|
|
||||||
|
if is_connection_error:
|
||||||
|
self.logger.warning(f"Connection validation failed with connection error (attempt {attempt + 1}): {e}")
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"Connection test failed (attempt {attempt + 1}): {e}")
|
self.logger.warning(f"Connection validation failed (attempt {attempt + 1}): {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await raw_connection.ensure_closed()
|
await raw_connection.ensure_closed()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass # Ignore cleanup errors
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -346,8 +434,10 @@ class DorisConnectionManager:
|
|||||||
if attempt == max_retries - 1:
|
if attempt == max_retries - 1:
|
||||||
raise RuntimeError(f"Failed to create valid connection after {max_retries} attempts: {e}")
|
raise RuntimeError(f"Failed to create valid connection after {max_retries} attempts: {e}")
|
||||||
else:
|
else:
|
||||||
# Exponential backoff
|
# Exponential backoff with jitter to avoid thundering herd
|
||||||
await asyncio.sleep(0.5 * (2 ** attempt))
|
base_delay = 0.5 * (2 ** attempt)
|
||||||
|
jitter = base_delay * 0.1 * (0.5 - asyncio.get_running_loop().time() % 1)
|
||||||
|
await asyncio.sleep(base_delay + jitter)
|
||||||
|
|
||||||
raise RuntimeError("Failed to create valid connection")
|
raise RuntimeError("Failed to create valid connection")
|
||||||
|
|
||||||
@@ -505,42 +595,85 @@ class DorisConnectionManager:
|
|||||||
self.logger.error(f"Health check error: {e}")
|
self.logger.error(f"Health check error: {e}")
|
||||||
|
|
||||||
async def _perform_health_check(self):
|
async def _perform_health_check(self):
|
||||||
"""Perform enhanced health check"""
|
"""Perform enhanced health check with aggressive stale connection detection"""
|
||||||
try:
|
try:
|
||||||
unhealthy_sessions = []
|
unhealthy_sessions = []
|
||||||
|
stale_sessions = []
|
||||||
|
current_time = datetime.utcnow()
|
||||||
|
|
||||||
# Enhanced health check with comprehensive validation
|
# Enhanced health check with comprehensive validation
|
||||||
for session_id, conn in self.session_connections.items():
|
for session_id, conn in self.session_connections.items():
|
||||||
if not await self._comprehensive_connection_health_check(conn):
|
try:
|
||||||
unhealthy_sessions.append(session_id)
|
# Check 1: Basic connection health
|
||||||
|
if not await self._comprehensive_connection_health_check(conn):
|
||||||
|
unhealthy_sessions.append(session_id)
|
||||||
|
self.logger.debug(f"Session {session_id} marked as unhealthy")
|
||||||
|
continue
|
||||||
|
|
||||||
# Check for stale connections (over 30 minutes old)
|
# Check 2: Stale connection detection (much more aggressive)
|
||||||
current_time = datetime.utcnow()
|
time_since_last_use = (current_time - conn.last_used).total_seconds()
|
||||||
stale_sessions = []
|
connection_age = (current_time - conn.created_at).total_seconds()
|
||||||
for session_id, conn in self.session_connections.items():
|
|
||||||
if session_id not in unhealthy_sessions: # Don't double-check
|
# Mark as stale if:
|
||||||
last_used_delta = (current_time - conn.last_used).total_seconds()
|
# 1. Last used more than 15 minutes ago, OR
|
||||||
if last_used_delta > 1800: # 30 minutes
|
# 2. Connection age exceeds maximum age, OR
|
||||||
# Force a comprehensive health check for stale connections
|
# 3. Connection hasn't been used in a while and is old
|
||||||
if not await self._comprehensive_connection_health_check(conn):
|
if (time_since_last_use > self.stale_connection_threshold or
|
||||||
|
connection_age > self.max_connection_age or
|
||||||
|
(time_since_last_use > 300 and connection_age > 1800)): # 5 min unused + 30 min old
|
||||||
|
|
||||||
|
# For stale connections, do an extra validation
|
||||||
|
try:
|
||||||
|
# Try a more aggressive ping test
|
||||||
|
async with conn.connection.cursor() as cursor:
|
||||||
|
await asyncio.wait_for(cursor.execute("SELECT 1"), timeout=3)
|
||||||
|
await asyncio.wait_for(cursor.fetchone(), timeout=3)
|
||||||
|
# If we get here, connection is actually healthy despite being stale
|
||||||
|
self.logger.debug(f"Stale connection {session_id} passed extra validation")
|
||||||
|
except Exception as stale_test_error:
|
||||||
stale_sessions.append(session_id)
|
stale_sessions.append(session_id)
|
||||||
|
self.logger.debug(f"Session {session_id} marked as stale: {stale_test_error}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
except Exception as check_error:
|
||||||
|
# If we can't even check the connection, it's definitely problematic
|
||||||
|
self.logger.warning(f"Health check failed for session {session_id}: {check_error}")
|
||||||
|
unhealthy_sessions.append(session_id)
|
||||||
|
|
||||||
all_problematic_sessions = list(set(unhealthy_sessions + stale_sessions))
|
all_problematic_sessions = list(set(unhealthy_sessions + stale_sessions))
|
||||||
|
|
||||||
# Clean up problematic connections
|
# Clean up problematic connections
|
||||||
|
cleanup_results = {"success": 0, "failed": 0}
|
||||||
for session_id in all_problematic_sessions:
|
for session_id in all_problematic_sessions:
|
||||||
await self._cleanup_session_connection(session_id)
|
try:
|
||||||
self.metrics.failed_connections += 1
|
await self._cleanup_session_connection(session_id)
|
||||||
|
cleanup_results["success"] += 1
|
||||||
|
self.metrics.failed_connections += 1
|
||||||
|
except Exception as cleanup_error:
|
||||||
|
cleanup_results["failed"] += 1
|
||||||
|
self.logger.error(f"Failed to cleanup session {session_id}: {cleanup_error}")
|
||||||
|
|
||||||
# Update metrics
|
# Update metrics
|
||||||
await self._update_connection_metrics()
|
await self._update_connection_metrics()
|
||||||
self.metrics.last_health_check = datetime.utcnow()
|
self.metrics.last_health_check = datetime.utcnow()
|
||||||
|
|
||||||
|
# Log results
|
||||||
if all_problematic_sessions:
|
if all_problematic_sessions:
|
||||||
self.logger.warning(f"Health check: cleaned up {len(unhealthy_sessions)} unhealthy and {len(stale_sessions)} stale connections")
|
self.logger.warning(
|
||||||
|
f"Health check: cleaned up {len(unhealthy_sessions)} unhealthy and "
|
||||||
|
f"{len(stale_sessions)} stale connections "
|
||||||
|
f"(success: {cleanup_results['success']}, failed: {cleanup_results['failed']})"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self.logger.debug(f"Health check: all {len(self.session_connections)} connections healthy")
|
self.logger.debug(f"Health check: all {len(self.session_connections)} connections healthy")
|
||||||
|
|
||||||
|
# If we have a lot of connection failures, log some diagnostic info
|
||||||
|
if self.metrics.connection_errors > 50: # Threshold for diagnostic logging
|
||||||
|
self.logger.warning(
|
||||||
|
f"High connection error count detected: {self.metrics.connection_errors}. "
|
||||||
|
f"This may indicate persistent connectivity issues with the database."
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Health check failed: {e}")
|
self.logger.error(f"Health check failed: {e}")
|
||||||
# If health check fails, try to diagnose the issue
|
# If health check fails, try to diagnose the issue
|
||||||
@@ -551,10 +684,11 @@ class DorisConnectionManager:
|
|||||||
pass # Don't let diagnosis failure crash health check
|
pass # Don't let diagnosis failure crash health check
|
||||||
|
|
||||||
async def _cleanup_loop(self):
|
async def _cleanup_loop(self):
|
||||||
"""Background cleanup loop"""
|
"""Background cleanup loop with more frequent execution"""
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
await asyncio.sleep(300) # Run every 5 minutes
|
# Run cleanup more frequently - every 2 minutes instead of 5
|
||||||
|
await asyncio.sleep(120) # Run every 2 minutes
|
||||||
await self._cleanup_idle_connections()
|
await self._cleanup_idle_connections()
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
break
|
break
|
||||||
@@ -562,22 +696,69 @@ class DorisConnectionManager:
|
|||||||
self.logger.error(f"Cleanup loop error: {e}")
|
self.logger.error(f"Cleanup loop error: {e}")
|
||||||
|
|
||||||
async def _cleanup_idle_connections(self):
    """Release session connections that are over-age or idle and unresponsive.

    A session becomes a cleanup *candidate* when any of these hold:

    * its age exceeds ``self.max_connection_age``;
    * it has not been used for more than 20 minutes;
    * it is older than 30 minutes and unused for more than 10 minutes.

    Candidates are then probed. A candidate is actually cleaned up when its
    connection is missing or closed, when a 2-second ``ping()`` fails or
    times out, or when it exceeds the maximum age even though the ping
    succeeded. A candidate that answers the ping and is under the maximum
    age is kept. Sessions whose inspection itself raises are also cleaned up.

    Failures of ``_cleanup_session_connection`` are logged per session and
    do not stop the remaining sessions from being processed.
    """
    unused_limit = 1200        # 20 minutes without use
    aged_limit = 1800          # 30 minutes old ...
    aged_unused_limit = 600    # ... and 10 minutes without use

    current_time = datetime.utcnow()
    idle_sessions = []

    # Iterate over a snapshot: the loop body awaits, so other tasks (for
    # example the health-check task) can add or remove sessions meanwhile,
    # which would otherwise raise "dictionary changed size during iteration".
    for session_id, conn in list(self.session_connections.items()):
        try:
            connection_age = (current_time - conn.created_at).total_seconds()
            time_since_last_use = (current_time - conn.last_used).total_seconds()
            exceeded_max_age = connection_age > self.max_connection_age

            is_candidate = (
                exceeded_max_age
                or time_since_last_use > unused_limit
                or (connection_age > aged_limit and time_since_last_use > aged_unused_limit)
            )
            if not is_candidate:
                continue

            try:
                # A missing or closed connection needs no probe.
                if not conn.connection or conn.connection.closed:
                    idle_sessions.append(session_id)
                    continue

                # Quick liveness probe, bounded so a hung socket cannot
                # stall the whole sweep.
                await asyncio.wait_for(conn.connection.ping(), timeout=2)

                if exceeded_max_age:
                    # Responsive, but past its maximum lifetime.
                    idle_sessions.append(session_id)
                    self.logger.debug(
                        "Cleaning up old but healthy connection for session %s", session_id
                    )
                else:
                    self.logger.debug(
                        "Keeping healthy connection for session %s", session_id
                    )
            except Exception as health_error:
                # The probe failed, so the connection is not usable.
                idle_sessions.append(session_id)
                self.logger.debug(
                    "Cleanup marking session %s due to health check failure: %s",
                    session_id, health_error,
                )
        except Exception as e:
            # If the session cannot even be inspected, treat it as broken.
            self.logger.warning(
                "Error checking connection %s for cleanup: %s", session_id, e
            )
            idle_sessions.append(session_id)

    # Clean up the collected sessions, counting outcomes for the summary log.
    cleanup_results = {"success": 0, "failed": 0}
    for session_id in idle_sessions:
        try:
            await self._cleanup_session_connection(session_id)
            cleanup_results["success"] += 1
        except Exception as cleanup_error:
            cleanup_results["failed"] += 1
            self.logger.error(
                "Failed to cleanup idle session %s: %s", session_id, cleanup_error
            )

    if idle_sessions:
        self.logger.info(
            "Cleaned up %d idle connections (success: %d, failed: %d)",
            len(idle_sessions), cleanup_results["success"], cleanup_results["failed"],
        )
|
||||||
|
|
||||||
async def _update_connection_metrics(self):
|
async def _update_connection_metrics(self):
|
||||||
"""Update connection metrics"""
|
"""Update connection metrics"""
|
||||||
|
|||||||
Reference in New Issue
Block a user