at_eof bug fix

2025-07-03 21:31:54 +08:00
parent 693c48d5ee
commit aa953e9fe1
1 changed files with 238 additions and 57 deletions
--- a/doris_mcp_server/utils/db.py
+++ b/doris_mcp_server/utils/db.py
@@ -70,6 +70,7 @@ class DorisConnection:
        self.query_count = 0
        self.is_healthy = True
        self.security_manager = security_manager
+        self.logger = logging.getLogger(__name__)

    async def execute(self, sql: str, params: tuple | None = None, auth_context=None) -> QueryResult:
        """Execute SQL query"""
@@ -135,31 +136,88 @@ class DorisConnection:
            raise

    async def ping(self) -> bool:
-        """Check connection health status"""
+        """Check connection health status with enhanced at_eof error detection"""
        try:
-            # Check if connection exists and is not closed
+            # Check 1: Connection exists and is not closed
            if not self.connection or self.connection.closed:
                self.is_healthy = False
                return False
            
-            # Check if connection has _reader (aiomysql internal state)
-            # This prevents the 'NoneType' object has no attribute 'at_eof' error
+            # Check 2: Comprehensive internal state validation
+            # This is critical for detecting at_eof issues before they cause errors
            if not hasattr(self.connection, '_reader') or self.connection._reader is None:
+                self.logger.debug(f"Connection {self.session_id} has invalid _reader state")
                self.is_healthy = False
                return False
            
-            # Additional check for reader's state
-            if hasattr(self.connection._reader, '_transport') and self.connection._reader._transport is None:
+            # Check 3: Verify transport state
+            if (hasattr(self.connection._reader, '_transport') and 
+                self.connection._reader._transport is None):
+                self.logger.debug(f"Connection {self.session_id} has invalid transport state")
                self.is_healthy = False
                return False
            
-            # Try to ping the connection
-            await self.connection.ping()
+            # Check 4: Additional stream state validation
+            if (hasattr(self.connection._reader, 'at_eof') and 
+                callable(self.connection._reader.at_eof)):
+                try:
+                    # If the stream is already at EOF, the connection is broken
+                    if self.connection._reader.at_eof():
+                        self.logger.debug(f"Connection {self.session_id} reader is at EOF")
+                        self.is_healthy = False
+                        return False
+                except Exception:
+                    # If we can't even check at_eof, the connection is problematic
+                    self.logger.debug(f"Connection {self.session_id} cannot check at_eof state")
+                    self.is_healthy = False
+                    return False
+            
+            # Check 5: Try to ping the connection with timeout
+            try:
+                await asyncio.wait_for(self.connection.ping(), timeout=5)
+            except asyncio.TimeoutError:
+                self.logger.debug(f"Connection {self.session_id} ping timeout")
+                self.is_healthy = False
+                return False
+            except Exception as ping_error:
+                # Check for specific error patterns
+                error_str = str(ping_error).lower()
+                if any(keyword in error_str for keyword in ['at_eof', 'nonetype', 'reader', 'transport']):
+                    self.logger.debug(f"Connection {self.session_id} ping failed with connection state error: {ping_error}")
+                else:
+                    self.logger.debug(f"Connection {self.session_id} ping failed: {ping_error}")
+                self.is_healthy = False
+                return False
+            
+            # Check 6: Final validation with a simple query
+            try:
+                async with self.connection.cursor() as cursor:
+                    await asyncio.wait_for(cursor.execute("SELECT 1"), timeout=3)
+                    result = await asyncio.wait_for(cursor.fetchone(), timeout=3)
+                    if not result or result[0] != 1:
+                        self.logger.debug(f"Connection {self.session_id} test query returned invalid result")
+                        self.is_healthy = False
+                        return False
+            except Exception as query_error:
+                error_str = str(query_error).lower()
+                if any(keyword in error_str for keyword in ['at_eof', 'nonetype', 'reader', 'transport']):
+                    self.logger.debug(f"Connection {self.session_id} test query failed with connection state error: {query_error}")
+                else:
+                    self.logger.debug(f"Connection {self.session_id} test query failed: {query_error}")
+                self.is_healthy = False
+                return False
+            
+            # If all checks pass, the connection is healthy
            self.is_healthy = True
            return True
-        except (AttributeError, OSError, ConnectionError, Exception) as e:
-            # Log the specific error for debugging
-            logging.debug(f"Connection ping failed for session {self.session_id}: {e}")
+            
+        except Exception as e:
+            # Any uncaught exception means the connection is not healthy
+            error_str = str(e).lower()
+            if any(keyword in error_str for keyword in ['at_eof', 'nonetype', 'reader', 'transport']):
+                self.logger.debug(f"Connection {self.session_id} ping failed with connection state error: {e}")
+            else:
+                self.logger.debug(f"Connection {self.session_id} ping failed with unexpected error: {e}")
            self.is_healthy = False
            return False

@@ -188,11 +246,15 @@ class DorisConnectionManager:
        self.logger = logging.getLogger(__name__)
        self.security_manager = security_manager

-        # Health check configuration
-        self.health_check_interval = config.database.health_check_interval or 60
+        # Enhanced health check configuration for long-connection issues
+        # Reduce health check interval to detect stale connections faster
+        self.health_check_interval = min(config.database.health_check_interval or 60, 30)  # Max 30 seconds
        self.max_connection_age = config.database.max_connection_age or 3600
        self.connection_timeout = config.database.connection_timeout or 30
        
+        # Add stale connection detection threshold (much shorter than MySQL's wait_timeout)
+        self.stale_connection_threshold = 900  # 15 minutes - connections older than this are considered stale
+        
        # Start background tasks
        self._health_check_task = None
        self._cleanup_task = None
@@ -210,8 +272,11 @@ class DorisConnectionManager:
            if not self.config.database.password:
                self.logger.warning("Database password is empty, this may cause connection issues")
            
-            # Create connection pool with improved stability parameters
-            # Key change: Set minsize=0 to avoid pre-creation issues that cause at_eof errors
+            # Create connection pool with aggressive connection recycling to prevent at_eof issues
+            # Key changes:
+            # 1. Reduce pool_recycle to 30 minutes (1800 seconds) - much shorter than MySQL's wait_timeout
+            # 2. Add shorter connect_timeout to fail fast on bad connections
+            # 3. Enable autocommit to avoid transaction state issues
            self.pool = await aiomysql.create_pool(
                host=self.config.database.host,
                port=self.config.database.port,
@@ -220,12 +285,11 @@ class DorisConnectionManager:
                db=self.config.database.database,
                charset="utf8",
                minsize=self.config.database.min_connections,  # Always 0 per configuration to avoid at_eof issues
-
                maxsize=self.config.database.max_connections or 20,
                autocommit=True,
-                connect_timeout=self.connection_timeout,
-                # Enhanced stability parameters
-                pool_recycle=7200,  # Recycle connections every 2 hours
+                connect_timeout=15,  # Shorter timeout to fail fast
+                # Aggressive connection recycling to prevent stale connections
+                pool_recycle=1800,  # Recycle connections every 30 minutes instead of 2 hours
                echo=False,  # Don't echo SQL statements
            )

@@ -234,13 +298,12 @@ class DorisConnectionManager:
                raise RuntimeError("Connection pool robust test failed")

            self.logger.info(
-                f"Connection pool initialized successfully with on-demand connection creation, "
+                f"Connection pool initialized successfully with aggressive recycling (30min), "
                f"min connections: {self.config.database.min_connections}, "
-
                f"max connections: {self.config.database.max_connections or 20}"
            )

-            # Start background monitoring tasks
+            # Start background monitoring tasks with more frequent health checks
            self._health_check_task = asyncio.create_task(self._health_check_loop())
            self._cleanup_task = asyncio.create_task(self._cleanup_loop())

@@ -312,10 +375,25 @@ class DorisConnectionManager:
                    self.logger.warning(f"Pool returned closed connection (attempt {attempt + 1})")
                    continue
                
-                # Perform a simple ping test instead of checking internal state
-                # Internal state (_reader, _transport) might not be fully initialized yet
+                # Enhanced connection validation with multiple checks
                try:
-                    # Test basic connectivity with a simple query
+                    # Check 1: Verify connection object internal state
+                    if not hasattr(raw_connection, '_reader') or raw_connection._reader is None:
+                        self.logger.warning(f"Connection has invalid _reader state (attempt {attempt + 1})")
+                        await raw_connection.ensure_closed()
+                        continue
+                        
+                    # Check 2: Verify transport state
+                    if (hasattr(raw_connection._reader, '_transport') and 
+                        raw_connection._reader._transport is None):
+                        self.logger.warning(f"Connection has invalid transport state (attempt {attempt + 1})")
+                        await raw_connection.ensure_closed()
+                        continue
+                    
+                    # Check 3: Perform ping test to verify server-side connectivity
+                    await raw_connection.ping()
+                    
+                    # Check 4: Test with actual query execution
                    async with raw_connection.cursor() as cursor:
                        await cursor.execute("SELECT 1")
                        result = await cursor.fetchone()
@@ -328,17 +406,27 @@ class DorisConnectionManager:
                            continue
                            
                except Exception as e:
-                    # Check if this is an at_eof error specifically
+                    # Enhanced error detection for connection issues
                    error_str = str(e).lower()
-                    if 'at_eof' in error_str or 'nonetype' in error_str:
-                        self.logger.warning(f"Connection has at_eof issue (attempt {attempt + 1}): {e}")
+                    
+                    # Check for various connection-related errors
+                    connection_error_keywords = [
+                        'at_eof', 'nonetype', 'connection', 'transport', 'reader', 
+                        'lost connection', 'broken pipe', 'connection reset',
+                        'timed out', 'connection refused', 'host unreachable'
+                    ]
+                    
+                    is_connection_error = any(keyword in error_str for keyword in connection_error_keywords)
+                    
+                    if is_connection_error:
+                        self.logger.warning(f"Connection validation failed with connection error (attempt {attempt + 1}): {e}")
                    else:
-                        self.logger.warning(f"Connection test failed (attempt {attempt + 1}): {e}")
+                        self.logger.warning(f"Connection validation failed (attempt {attempt + 1}): {e}")
                    
                    try:
                        await raw_connection.ensure_closed()
                    except Exception:
-                        pass
+                        pass  # Ignore cleanup errors
                    continue
                
            except Exception as e:
@@ -346,8 +434,10 @@ class DorisConnectionManager:
                if attempt == max_retries - 1:
                    raise RuntimeError(f"Failed to create valid connection after {max_retries} attempts: {e}")
                else:
-                    # Exponential backoff
-                    await asyncio.sleep(0.5 * (2 ** attempt))
+                    # Exponential backoff with jitter to avoid thundering herd
+                    base_delay = 0.5 * (2 ** attempt)
+                    jitter = base_delay * 0.1 * (0.5 - asyncio.get_running_loop().time() % 1)
+                    await asyncio.sleep(base_delay + jitter)
        
        raise RuntimeError("Failed to create valid connection")

@@ -505,42 +595,85 @@ class DorisConnectionManager:
                self.logger.error(f"Health check error: {e}")

    async def _perform_health_check(self):
-        """Perform enhanced health check"""
+        """Perform enhanced health check with aggressive stale connection detection"""
        try:
            unhealthy_sessions = []
+            stale_sessions = []
+            current_time = datetime.utcnow()
            
            # Enhanced health check with comprehensive validation
            for session_id, conn in self.session_connections.items():
+                try:
+                    # Check 1: Basic connection health
                    if not await self._comprehensive_connection_health_check(conn):
                        unhealthy_sessions.append(session_id)
+                        self.logger.debug(f"Session {session_id} marked as unhealthy")
+                        continue
                    
-            # Check for stale connections (over 30 minutes old)
-            current_time = datetime.utcnow()
-            stale_sessions = []
-            for session_id, conn in self.session_connections.items():
-                if session_id not in unhealthy_sessions:  # Don't double-check
-                    last_used_delta = (current_time - conn.last_used).total_seconds()
-                    if last_used_delta > 1800:  # 30 minutes
-                        # Force a comprehensive health check for stale connections
-                        if not await self._comprehensive_connection_health_check(conn):
+                    # Check 2: Stale connection detection (much more aggressive)
+                    time_since_last_use = (current_time - conn.last_used).total_seconds()
+                    connection_age = (current_time - conn.created_at).total_seconds()
+                    
+                    # Mark as stale if:
+                    # 1. Last used more than 15 minutes ago, OR
+                    # 2. Connection age exceeds maximum age, OR  
+                    # 3. Connection hasn't been used in a while and is old
+                    if (time_since_last_use > self.stale_connection_threshold or
+                        connection_age > self.max_connection_age or
+                        (time_since_last_use > 300 and connection_age > 1800)):  # 5 min unused + 30 min old
+                        
+                        # For stale connections, do an extra validation
+                        try:
+                            # Try a more aggressive ping test
+                            async with conn.connection.cursor() as cursor:
+                                await asyncio.wait_for(cursor.execute("SELECT 1"), timeout=3)
+                                await asyncio.wait_for(cursor.fetchone(), timeout=3)
+                            # If we get here, connection is actually healthy despite being stale
+                            self.logger.debug(f"Stale connection {session_id} passed extra validation")
+                        except Exception as stale_test_error:
                            stale_sessions.append(session_id)
+                            self.logger.debug(f"Session {session_id} marked as stale: {stale_test_error}")
+                            continue
+                    
+                except Exception as check_error:
+                    # If we can't even check the connection, it's definitely problematic
+                    self.logger.warning(f"Health check failed for session {session_id}: {check_error}")
+                    unhealthy_sessions.append(session_id)
            
            all_problematic_sessions = list(set(unhealthy_sessions + stale_sessions))
            
            # Clean up problematic connections
+            cleanup_results = {"success": 0, "failed": 0}
            for session_id in all_problematic_sessions:
+                try:
                    await self._cleanup_session_connection(session_id)
+                    cleanup_results["success"] += 1
                    self.metrics.failed_connections += 1
+                except Exception as cleanup_error:
+                    cleanup_results["failed"] += 1
+                    self.logger.error(f"Failed to cleanup session {session_id}: {cleanup_error}")
            
            # Update metrics
            await self._update_connection_metrics()
            self.metrics.last_health_check = datetime.utcnow()
            
+            # Log results
            if all_problematic_sessions:
-                self.logger.warning(f"Health check: cleaned up {len(unhealthy_sessions)} unhealthy and {len(stale_sessions)} stale connections")
+                self.logger.warning(
+                    f"Health check: cleaned up {len(unhealthy_sessions)} unhealthy and "
+                    f"{len(stale_sessions)} stale connections "
+                    f"(success: {cleanup_results['success']}, failed: {cleanup_results['failed']})"
+                )
            else:
                self.logger.debug(f"Health check: all {len(self.session_connections)} connections healthy")
            
+            # If we have a lot of connection failures, log some diagnostic info
+            if self.metrics.connection_errors > 50:  # Threshold for diagnostic logging
+                self.logger.warning(
+                    f"High connection error count detected: {self.metrics.connection_errors}. "
+                    f"This may indicate persistent connectivity issues with the database."
+                )
+
        except Exception as e:
            self.logger.error(f"Health check failed: {e}")
            # If health check fails, try to diagnose the issue
@@ -551,10 +684,11 @@ class DorisConnectionManager:
                pass  # Don't let diagnosis failure crash health check

    async def _cleanup_loop(self):
-        """Background cleanup loop"""
+        """Background cleanup loop with more frequent execution"""
        while True:
            try:
-                await asyncio.sleep(300)  # Run every 5 minutes
+                # Run cleanup more frequently - every 2 minutes instead of 5
+                await asyncio.sleep(120)  # Run every 2 minutes
                await self._cleanup_idle_connections()
            except asyncio.CancelledError:
                break
@@ -562,22 +696,69 @@ class DorisConnectionManager:
                self.logger.error(f"Cleanup loop error: {e}")

    async def _cleanup_idle_connections(self):
-        """Clean up idle connections"""
+        """Clean up idle connections with more aggressive criteria"""
        current_time = datetime.utcnow()
        idle_sessions = []
        
        for session_id, conn in self.session_connections.items():
-            # Check if connection has exceeded maximum age
-            age = (current_time - conn.created_at).total_seconds()
-            if age > self.max_connection_age:
+            try:
+                # Enhanced idle connection detection
+                connection_age = (current_time - conn.created_at).total_seconds()
+                time_since_last_use = (current_time - conn.last_used).total_seconds()
+                
+                # Mark as idle if:
+                # 1. Connection has exceeded maximum age, OR
+                # 2. Connection hasn't been used for more than 20 minutes, OR
+                # 3. Connection is old and hasn't been used recently
+                should_cleanup = (
+                    connection_age > self.max_connection_age or
+                    time_since_last_use > 1200 or  # 20 minutes unused
+                    (connection_age > 1800 and time_since_last_use > 600)  # 30 min old + 10 min unused
+                )
+                
+                if should_cleanup:
+                    # Before marking for cleanup, try a quick health check
+                    try:
+                        # Quick validation - if this fails, definitely cleanup
+                        if not conn.connection or conn.connection.closed:
+                            idle_sessions.append(session_id)
+                            continue
+                            
+                        # Quick ping test with timeout
+                        await asyncio.wait_for(conn.connection.ping(), timeout=2)
+                        
+                        # If ping succeeds but connection is still very old, cleanup anyway
+                        if connection_age > self.max_connection_age:
+                            idle_sessions.append(session_id)
+                            self.logger.debug(f"Cleaning up old but healthy connection for session {session_id}")
+                        else:
+                            self.logger.debug(f"Keeping healthy connection for session {session_id}")
+                            
+                    except Exception as health_error:
+                        # Health check failed, definitely cleanup
+                        idle_sessions.append(session_id)
+                        self.logger.debug(f"Cleanup marking session {session_id} due to health check failure: {health_error}")
+                        
+            except Exception as e:
+                self.logger.warning(f"Error checking connection {session_id} for cleanup: {e}")
+                # If we can't even check it, it's probably broken
                idle_sessions.append(session_id)
        
        # Clean up idle connections
+        cleanup_results = {"success": 0, "failed": 0}
        for session_id in idle_sessions:
+            try:
                await self._cleanup_session_connection(session_id)
+                cleanup_results["success"] += 1
+            except Exception as cleanup_error:
+                cleanup_results["failed"] += 1
+                self.logger.error(f"Failed to cleanup idle session {session_id}: {cleanup_error}")
        
        if idle_sessions:
-            self.logger.info(f"Cleaned up {len(idle_sessions)} idle connections")
+            self.logger.info(
+                f"Cleaned up {len(idle_sessions)} idle connections "
+                f"(success: {cleanup_results['success']}, failed: {cleanup_results['failed']})"
+            )

    async def _update_connection_metrics(self):
        """Update connection metrics"""