at_eof bug fix

2025-07-03 21:31:54 +08:00
parent 693c48d5ee
commit aa953e9fe1
1 changed files with 238 additions and 57 deletions
--- a/doris_mcp_server/utils/db.py
+++ b/doris_mcp_server/utils/db.py
@@ -70,6 +70,7 @@ class DorisConnection:
        self.query_count = 0
        self.is_healthy = True
        self.security_manager = security_manager
        self.logger = logging.getLogger(__name__)
    async def execute(self, sql: str, params: tuple | None = None, auth_context=None) -> QueryResult:
        """Execute SQL query"""
@@ -135,31 +136,88 @@ class DorisConnection:
            raise
    async def ping(self) -> bool:
-        """Check connection health status"""
+        """Check connection health status with enhanced at_eof error detection"""
        try:
-            # Check if connection exists and is not closed
+            # Check 1: Connection exists and is not closed
            if not self.connection or self.connection.closed:
                self.is_healthy = False
                return False
-            # Check if connection has _reader (aiomysql internal state)
+            # Check 2: Comprehensive internal state validation
-            # This prevents the 'NoneType' object has no attribute 'at_eof' error
+            # This is critical for detecting at_eof issues before they cause errors
            if not hasattr(self.connection, '_reader') or self.connection._reader is None:
                self.logger.debug(f"Connection {self.session_id} has invalid _reader state")
                self.is_healthy = False
                return False
-            # Additional check for reader's state
+            # Check 3: Verify transport state
-            if hasattr(self.connection._reader, '_transport') and self.connection._reader._transport is None:
+            if (hasattr(self.connection._reader, '_transport') and 
                self.connection._reader._transport is None):
                self.logger.debug(f"Connection {self.session_id} has invalid transport state")
                self.is_healthy = False
                return False
-            # Try to ping the connection
+            # Check 4: Additional stream state validation
-            await self.connection.ping()
+            if (hasattr(self.connection._reader, 'at_eof') and 
                callable(self.connection._reader.at_eof)):
                try:
                    # If the stream is already at EOF, the connection is broken
                    if self.connection._reader.at_eof():
                        self.logger.debug(f"Connection {self.session_id} reader is at EOF")
                        self.is_healthy = False
                        return False
                except Exception:
                    # If we can't even check at_eof, the connection is problematic
                    self.logger.debug(f"Connection {self.session_id} cannot check at_eof state")
                    self.is_healthy = False
                    return False
            # Check 5: Try to ping the connection with timeout
            try:
                await asyncio.wait_for(self.connection.ping(), timeout=5)
            except asyncio.TimeoutError:
                self.logger.debug(f"Connection {self.session_id} ping timeout")
                self.is_healthy = False
                return False
            except Exception as ping_error:
                # Check for specific error patterns
                error_str = str(ping_error).lower()
                if any(keyword in error_str for keyword in ['at_eof', 'nonetype', 'reader', 'transport']):
                    self.logger.debug(f"Connection {self.session_id} ping failed with connection state error: {ping_error}")
                else:
                    self.logger.debug(f"Connection {self.session_id} ping failed: {ping_error}")
                self.is_healthy = False
                return False
            # Check 6: Final validation with a simple query
            try:
                async with self.connection.cursor() as cursor:
                    await asyncio.wait_for(cursor.execute("SELECT 1"), timeout=3)
                    result = await asyncio.wait_for(cursor.fetchone(), timeout=3)
                    if not result or result[0] != 1:
                        self.logger.debug(f"Connection {self.session_id} test query returned invalid result")
                        self.is_healthy = False
                        return False
            except Exception as query_error:
                error_str = str(query_error).lower()
                if any(keyword in error_str for keyword in ['at_eof', 'nonetype', 'reader', 'transport']):
                    self.logger.debug(f"Connection {self.session_id} test query failed with connection state error: {query_error}")
                else:
                    self.logger.debug(f"Connection {self.session_id} test query failed: {query_error}")
                self.is_healthy = False
                return False
            # If all checks pass, the connection is healthy
            self.is_healthy = True
            return True
-        except (AttributeError, OSError, ConnectionError, Exception) as e:
+            
-            # Log the specific error for debugging
+        except Exception as e:
-            logging.debug(f"Connection ping failed for session {self.session_id}: {e}")
+            # Any uncaught exception means the connection is not healthy
            error_str = str(e).lower()
            if any(keyword in error_str for keyword in ['at_eof', 'nonetype', 'reader', 'transport']):
                self.logger.debug(f"Connection {self.session_id} ping failed with connection state error: {e}")
            else:
                self.logger.debug(f"Connection {self.session_id} ping failed with unexpected error: {e}")
            self.is_healthy = False
            return False
@@ -188,11 +246,15 @@ class DorisConnectionManager:
        self.logger = logging.getLogger(__name__)
        self.security_manager = security_manager
-        # Health check configuration
+        # Enhanced health check configuration for long-connection issues
-        self.health_check_interval = config.database.health_check_interval or 60
+        # Reduce health check interval to detect stale connections faster
        self.health_check_interval = min(config.database.health_check_interval or 60, 30)  # Max 30 seconds
        self.max_connection_age = config.database.max_connection_age or 3600
        self.connection_timeout = config.database.connection_timeout or 30
        # Add stale connection detection threshold (much shorter than MySQL's wait_timeout)
        self.stale_connection_threshold = 900  # 15 minutes - connections older than this are considered stale
        # Start background tasks
        self._health_check_task = None
        self._cleanup_task = None
@@ -210,8 +272,11 @@ class DorisConnectionManager:
            if not self.config.database.password:
                self.logger.warning("Database password is empty, this may cause connection issues")
-            # Create connection pool with improved stability parameters
+            # Create connection pool with aggressive connection recycling to prevent at_eof issues
-            # Key change: Set minsize=0 to avoid pre-creation issues that cause at_eof errors
+            # Key changes:
            # 1. Reduce pool_recycle to 30 minutes (1800 seconds) - much shorter than MySQL's wait_timeout
            # 2. Add shorter connect_timeout to fail fast on bad connections
            # 3. Enable autocommit to avoid transaction state issues
            self.pool = await aiomysql.create_pool(
                host=self.config.database.host,
                port=self.config.database.port,
@@ -220,12 +285,11 @@ class DorisConnectionManager:
                db=self.config.database.database,
                charset="utf8",
                minsize=self.config.database.min_connections,  # Always 0 per configuration to avoid at_eof issues
                maxsize=self.config.database.max_connections or 20,
                autocommit=True,
-                connect_timeout=self.connection_timeout,
+                connect_timeout=15,  # Shorter timeout to fail fast
-                # Enhanced stability parameters
+                # Aggressive connection recycling to prevent stale connections
-                pool_recycle=7200,  # Recycle connections every 2 hours
+                pool_recycle=1800,  # Recycle connections every 30 minutes instead of 2 hours
                echo=False,  # Don't echo SQL statements
            )
@@ -234,13 +298,12 @@ class DorisConnectionManager:
                raise RuntimeError("Connection pool robust test failed")
            self.logger.info(
-                f"Connection pool initialized successfully with on-demand connection creation, "
+                f"Connection pool initialized successfully with aggressive recycling (30min), "
                f"min connections: {self.config.database.min_connections}, "
                f"max connections: {self.config.database.max_connections or 20}"
            )
-            # Start background monitoring tasks
+            # Start background monitoring tasks with more frequent health checks
            self._health_check_task = asyncio.create_task(self._health_check_loop())
            self._cleanup_task = asyncio.create_task(self._cleanup_loop())
@@ -312,10 +375,25 @@ class DorisConnectionManager:
                    self.logger.warning(f"Pool returned closed connection (attempt {attempt + 1})")
                    continue
-                # Perform a simple ping test instead of checking internal state
+                # Enhanced connection validation with multiple checks
                # NOTE(review): internal state (_reader, _transport) might not be fully initialized yet; confirm the state checks below cannot reject a usable connection
                try:
-                    # Test basic connectivity with a simple query
+                    # Check 1: Verify connection object internal state
                    if not hasattr(raw_connection, '_reader') or raw_connection._reader is None:
                        self.logger.warning(f"Connection has invalid _reader state (attempt {attempt + 1})")
                        await raw_connection.ensure_closed()
                        continue
                    # Check 2: Verify transport state
                    if (hasattr(raw_connection._reader, '_transport') and 
                        raw_connection._reader._transport is None):
                        self.logger.warning(f"Connection has invalid transport state (attempt {attempt + 1})")
                        await raw_connection.ensure_closed()
                        continue
                    # Check 3: Perform ping test to verify server-side connectivity
                    await raw_connection.ping()
                    # Check 4: Test with actual query execution
                    async with raw_connection.cursor() as cursor:
                        await cursor.execute("SELECT 1")
                        result = await cursor.fetchone()
@@ -328,17 +406,27 @@ class DorisConnectionManager:
                            continue
                except Exception as e:
-                    # Check if this is an at_eof error specifically
+                    # Enhanced error detection for connection issues
                    error_str = str(e).lower()
-                    if 'at_eof' in error_str or 'nonetype' in error_str:
+                    
-                        self.logger.warning(f"Connection has at_eof issue (attempt {attempt + 1}): {e}")
+                    # Check for various connection-related errors
                    connection_error_keywords = [
                        'at_eof', 'nonetype', 'connection', 'transport', 'reader', 
                        'lost connection', 'broken pipe', 'connection reset',
                        'timed out', 'connection refused', 'host unreachable'
                    ]
                    is_connection_error = any(keyword in error_str for keyword in connection_error_keywords)
                    if is_connection_error:
                        self.logger.warning(f"Connection validation failed with connection error (attempt {attempt + 1}): {e}")
                    else:
-                        self.logger.warning(f"Connection test failed (attempt {attempt + 1}): {e}")
+                        self.logger.warning(f"Connection validation failed (attempt {attempt + 1}): {e}")
                    try:
                        await raw_connection.ensure_closed()
                    except Exception:
-                        pass
+                        pass  # Ignore cleanup errors
                    continue
            except Exception as e:
@@ -346,8 +434,10 @@ class DorisConnectionManager:
                if attempt == max_retries - 1:
                    raise RuntimeError(f"Failed to create valid connection after {max_retries} attempts: {e}")
                else:
-                    # Exponential backoff
+                    # Exponential backoff plus a small offset derived from the event-loop clock (within ±5% of the delay); this is not random jitter
-                    await asyncio.sleep(0.5 * (2 ** attempt))
+                    base_delay = 0.5 * (2 ** attempt)
                    jitter = base_delay * 0.1 * (0.5 - asyncio.get_running_loop().time() % 1)
                    await asyncio.sleep(base_delay + jitter)
        raise RuntimeError("Failed to create valid connection")
@@ -505,42 +595,85 @@ class DorisConnectionManager:
                self.logger.error(f"Health check error: {e}")
    async def _perform_health_check(self):
-        """Perform enhanced health check"""
+        """Perform enhanced health check with aggressive stale connection detection"""
        try:
            unhealthy_sessions = []
            stale_sessions = []
            current_time = datetime.utcnow()
            # Enhanced health check with comprehensive validation
            for session_id, conn in self.session_connections.items():
                try:
                    # Check 1: Basic connection health
                    if not await self._comprehensive_connection_health_check(conn):
                        unhealthy_sessions.append(session_id)
                        self.logger.debug(f"Session {session_id} marked as unhealthy")
                        continue
-            # Check for stale connections (over 30 minutes old)
+                    # Check 2: Stale connection detection (much more aggressive)
-            current_time = datetime.utcnow()
+                    time_since_last_use = (current_time - conn.last_used).total_seconds()
-            stale_sessions = []
+                    connection_age = (current_time - conn.created_at).total_seconds()
-            for session_id, conn in self.session_connections.items():
+                    
-                if session_id not in unhealthy_sessions:  # Don't double-check
+                    # Treat as a stale candidate (recorded as stale only if the SELECT 1 probe below fails) if:
-                    last_used_delta = (current_time - conn.last_used).total_seconds()
+                    # 1. Last used more than 15 minutes ago, OR
-                    if last_used_delta > 1800:  # 30 minutes
+                    # 2. Connection age exceeds maximum age, OR  
-                        # Force a comprehensive health check for stale connections
+                    # 3. Connection has been unused for more than 5 minutes AND is older than 30 minutes
-                        if not await self._comprehensive_connection_health_check(conn):
+                    if (time_since_last_use > self.stale_connection_threshold or
                        connection_age > self.max_connection_age or
                        (time_since_last_use > 300 and connection_age > 1800)):  # 5 min unused + 30 min old
                        # For stale connections, do an extra validation
                        try:
                            # Try a more aggressive ping test
                            async with conn.connection.cursor() as cursor:
                                await asyncio.wait_for(cursor.execute("SELECT 1"), timeout=3)
                                await asyncio.wait_for(cursor.fetchone(), timeout=3)
                            # If we get here, connection is actually healthy despite being stale
                            self.logger.debug(f"Stale connection {session_id} passed extra validation")
                        except Exception as stale_test_error:
                            stale_sessions.append(session_id)
                            self.logger.debug(f"Session {session_id} marked as stale: {stale_test_error}")
                            continue
                except Exception as check_error:
                    # If we can't even check the connection, it's definitely problematic
                    self.logger.warning(f"Health check failed for session {session_id}: {check_error}")
                    unhealthy_sessions.append(session_id)
            all_problematic_sessions = list(set(unhealthy_sessions + stale_sessions))
            # Clean up problematic connections
            cleanup_results = {"success": 0, "failed": 0}
            for session_id in all_problematic_sessions:
                try:
                    await self._cleanup_session_connection(session_id)
                    cleanup_results["success"] += 1
                    self.metrics.failed_connections += 1
                except Exception as cleanup_error:
                    cleanup_results["failed"] += 1
                    self.logger.error(f"Failed to cleanup session {session_id}: {cleanup_error}")
            # Update metrics
            await self._update_connection_metrics()
            self.metrics.last_health_check = datetime.utcnow()
            # Log results
            if all_problematic_sessions:
-                self.logger.warning(f"Health check: cleaned up {len(unhealthy_sessions)} unhealthy and {len(stale_sessions)} stale connections")
+                self.logger.warning(
                    f"Health check: cleaned up {len(unhealthy_sessions)} unhealthy and "
                    f"{len(stale_sessions)} stale connections "
                    f"(success: {cleanup_results['success']}, failed: {cleanup_results['failed']})"
                )
            else:
                self.logger.debug(f"Health check: all {len(self.session_connections)} connections healthy")
            # If we have a lot of connection failures, log some diagnostic info
            if self.metrics.connection_errors > 50:  # Threshold for diagnostic logging
                self.logger.warning(
                    f"High connection error count detected: {self.metrics.connection_errors}. "
                    f"This may indicate persistent connectivity issues with the database."
                )
        except Exception as e:
            self.logger.error(f"Health check failed: {e}")
            # If health check fails, try to diagnose the issue
@@ -551,10 +684,11 @@ class DorisConnectionManager:
                pass  # Don't let diagnosis failure crash health check
    async def _cleanup_loop(self):
-        """Background cleanup loop"""
+        """Background cleanup loop with more frequent execution"""
        while True:
            try:
-                await asyncio.sleep(300)  # Run every 5 minutes
+                # Run cleanup more frequently - every 2 minutes instead of 5
                await asyncio.sleep(120)  # Run every 2 minutes
                await self._cleanup_idle_connections()
            except asyncio.CancelledError:
                break
@@ -562,22 +696,69 @@ class DorisConnectionManager:
                self.logger.error(f"Cleanup loop error: {e}")
    async def _cleanup_idle_connections(self):
-        """Clean up idle connections"""
+        """Clean up idle connections with more aggressive criteria"""
        current_time = datetime.utcnow()
        idle_sessions = []
        for session_id, conn in self.session_connections.items():
-            # Check if connection has exceeded maximum age
+            try:
-            age = (current_time - conn.created_at).total_seconds()
+                # Enhanced idle connection detection
-            if age > self.max_connection_age:
+                connection_age = (current_time - conn.created_at).total_seconds()
                time_since_last_use = (current_time - conn.last_used).total_seconds()
                # Mark as idle if:
                # 1. Connection has exceeded maximum age, OR
                # 2. Connection hasn't been used for more than 20 minutes, OR
                # 3. Connection is older than 30 minutes AND has been unused for more than 10 minutes
                should_cleanup = (
                    connection_age > self.max_connection_age or
                    time_since_last_use > 1200 or  # 20 minutes unused
                    (connection_age > 1800 and time_since_last_use > 600)  # 30 min old + 10 min unused
                )
                if should_cleanup:
                    # Before marking for cleanup, try a quick health check
                    try:
                        # Quick validation - if this fails, definitely cleanup
                        if not conn.connection or conn.connection.closed:
                            idle_sessions.append(session_id)
                            continue
                        # Quick ping test with timeout
                        await asyncio.wait_for(conn.connection.ping(), timeout=2)
                        # If ping succeeds but connection is still very old, cleanup anyway
                        if connection_age > self.max_connection_age:
                            idle_sessions.append(session_id)
                            self.logger.debug(f"Cleaning up old but healthy connection for session {session_id}")
                        else:
                            self.logger.debug(f"Keeping healthy connection for session {session_id}")
                    except Exception as health_error:
                        # Health check failed, definitely cleanup
                        idle_sessions.append(session_id)
                        self.logger.debug(f"Cleanup marking session {session_id} due to health check failure: {health_error}")
            except Exception as e:
                self.logger.warning(f"Error checking connection {session_id} for cleanup: {e}")
                # If we can't even check it, it's probably broken
                idle_sessions.append(session_id)
        # Clean up idle connections
        cleanup_results = {"success": 0, "failed": 0}
        for session_id in idle_sessions:
            try:
                await self._cleanup_session_connection(session_id)
                cleanup_results["success"] += 1
            except Exception as cleanup_error:
                cleanup_results["failed"] += 1
                self.logger.error(f"Failed to cleanup idle session {session_id}: {cleanup_error}")
        if idle_sessions:
-            self.logger.info(f"Cleaned up {len(idle_sessions)} idle connections")
+            self.logger.info(
                f"Cleaned up {len(idle_sessions)} idle connections "
                f"(success: {cleanup_results['success']}, failed: {cleanup_results['failed']})"
            )
    async def _update_connection_metrics(self):
        """Update connection metrics"""