[BUG] Optimize and fix the capabilities of the 0.5.0 tools (#26)

1. **Unified Naming for CLI Arguments and Environment Variables** 
- All database-related CLI arguments now use the `--doris-*` prefix, and environment variables use `DORIS_*` for consistency and maintainability. 
- Backward compatibility: old `--db-*` arguments are still supported.

2. **Automatic Filtering of System SQL in Slow Query TopN** 
- Slow query analysis now automatically excludes SQL statements involving `__internal_schema`, `information_schema`, and `mysql` system databases, ensuring only business-related slow queries are counted. 
- Filtering is performed at the SQL level using `NOT LIKE` and `state != 'ERR'` for efficiency and safety.

3. **Unified Query Timeout Configuration** 
- If no `timeout` is specified for query execution, the system will use the `config.performance.query_timeout` value as the default, falling back to 30 seconds if not configured.
- This avoids hardcoding and makes timeout management more flexible.

4. **Tool execution optimization**
- Significantly reduce the execution time of some data governance and operations and maintenance (O&M) tools
- Optimize execution logic and reduce data scanning
- Enable concurrent scanning to speed up retrieval

5. **Log system optimization**
- Fix the console log printing logic so that log content is output correctly
- Add execution-process logging for the advanced tools, making it easier to pinpoint where errors occur

6. **DB Connection optimization**
- Fixed an exception raised when acquiring a connection from the pool, caused by a deadlock

7. **Other Improvements**
- Help documentation and CLI examples updated to reflect new and legacy parameter compatibility.
- Code comments and documentation further standardized for better team collaboration and open-source community understanding.
This commit is contained in:
Yijia Su
2025-07-14 19:04:11 +08:00
committed by GitHub
parent 54572d0861
commit 651d524814
8 changed files with 2479 additions and 1467 deletions

View File

@@ -61,7 +61,7 @@ class DorisToolsManager:
# Initialize v0.5.0 advanced analytics tools
self.data_governance_tools = DataGovernanceTools(connection_manager)
self.data_exploration_tools = DataExplorationTools(connection_manager)
self.data_quality_tools = DataQualityTools(connection_manager)
self.data_quality_tools = DataQualityTools(connection_manager, connection_manager.config)
self.security_analytics_tools = SecurityAnalyticsTools(connection_manager)
self.dependency_analysis_tools = DependencyAnalysisTools(connection_manager)
self.performance_analytics_tools = PerformanceAnalyticsTools(connection_manager)
@@ -464,41 +464,87 @@ class DorisToolsManager:
# 🔄 Unified Data Quality Analysis Tool (New in v0.5.0)
@mcp.tool(
"analyze_data_quality",
description="""[Function Description]: Comprehensive data quality analysis combining completeness and distribution analysis.
"get_table_basic_info",
description="""[Function Description]: Get basic information about a table including row count, column count, partitions, and size.
[Parameter Content]:
- table_name (string) [Required] - Name of the table to analyze
- analysis_scope (string) [Optional] - Analysis scope, default is "comprehensive"
* "completeness": Only completeness analysis (null rates, business rules)
* "distribution": Only distribution analysis (statistical patterns)
* "comprehensive": Full analysis including both completeness and distribution
- catalog_name (string) [Optional] - Target catalog name
- db_name (string) [Optional] - Target database name
""",
)
async def get_table_basic_info_tool(
    table_name: str,
    catalog_name: str = None,
    db_name: str = None
) -> str:
    """Forward a basic-info request for one table to the tool dispatcher.

    The three arguments are passed through unchanged under the
    "get_table_basic_info" tool name, and whatever ``self.call_tool``
    produces is returned as-is.
    """
    # Collect the arguments first so the dispatch call stays on one line.
    tool_arguments = {
        "table_name": table_name,
        "catalog_name": catalog_name,
        "db_name": db_name,
    }
    return await self.call_tool("get_table_basic_info", tool_arguments)
@mcp.tool(
"analyze_columns",
description="""[Function Description]: Analyze completeness and distribution of specified columns in a table.
[Parameter Content]:
- table_name (string) [Required] - Name of the table to analyze
- columns (array) [Required] - List of column names to analyze
- analysis_types (array) [Optional] - Types of analysis to perform, default is ["both"]
* "completeness": Only completeness analysis (null rates, non-null counts)
* "distribution": Only distribution analysis (statistical patterns by data type)
* "both": Both completeness and distribution analysis
- sample_size (integer) [Optional] - Maximum number of rows to sample, default is 100000
- include_all_columns (boolean) [Optional] - Whether to analyze all columns, default is false
- business_rules (array) [Optional] - Business rule validations in format [{"rule_name": "email_format", "sql_condition": "email REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'"}]
- catalog_name (string) [Optional] - Target catalog name
- db_name (string) [Optional] - Target database name
- detailed_response (boolean) [Optional] - Whether to return detailed response including raw data, default is false
""",
)
async def analyze_data_quality_tool(
async def analyze_columns_tool(
table_name: str,
analysis_scope: str = "comprehensive",
columns: List[str],
analysis_types: List[str] = None,
sample_size: int = 100000,
include_all_columns: bool = False,
business_rules: List[dict] = None,
catalog_name: str = None,
db_name: str = None,
detailed_response: bool = False
) -> str:
"""Unified data quality analysis tool"""
return await self.call_tool("analyze_data_quality", {
"""Analyze table columns"""
return await self.call_tool("analyze_columns", {
"table_name": table_name,
"analysis_scope": analysis_scope,
"columns": columns,
"analysis_types": analysis_types or ["both"],
"sample_size": sample_size,
"include_all_columns": include_all_columns,
"business_rules": business_rules,
"catalog_name": catalog_name,
"db_name": db_name,
"detailed_response": detailed_response
})
@mcp.tool(
"analyze_table_storage",
description="""[Function Description]: Analyze table's physical distribution and storage information.
[Parameter Content]:
- table_name (string) [Required] - Name of the table to analyze
- catalog_name (string) [Optional] - Target catalog name
- db_name (string) [Optional] - Target database name
- detailed_response (boolean) [Optional] - Whether to return detailed response including raw data, default is false
""",
)
async def analyze_table_storage_tool(
table_name: str,
catalog_name: str = None,
db_name: str = None,
detailed_response: bool = False
) -> str:
"""Analyze table storage"""
return await self.call_tool("analyze_table_storage", {
"table_name": table_name,
"catalog_name": catalog_name,
"db_name": db_name,
"detailed_response": detailed_response
@@ -721,7 +767,7 @@ No parameters required. Returns connection status, configuration, and diagnostic
"""Get ADBC connection information and status"""
return await self.call_tool("get_adbc_connection_info", {})
logger.info("Successfully registered 23 tools to MCP server (14 basic + 7 advanced analytics + 2 ADBC tools)")
logger.info("Successfully registered 25 tools to MCP server (14 basic + 9 advanced analytics + 2 ADBC tools)")
async def list_tools(self) -> List[Tool]:
"""List all available query tools (for stdio mode)"""
@@ -1064,20 +1110,14 @@ No parameters required. Returns connection status, configuration, and diagnostic
},
),
# ==================== v0.5.0 Advanced Analytics Tools ====================
# Atomic Data Quality Analysis Tools
Tool(
name="analyze_data_quality",
description="""[Function Description]: Comprehensive data quality analysis combining completeness and distribution analysis.
name="get_table_basic_info",
description="""[Function Description]: Get basic information about a table including row count, column count, partitions, and size.
[Parameter Content]:
- table_name (string) [Required] - Name of the table to analyze
- analysis_scope (string) [Optional] - Analysis scope, default is "comprehensive"
* "completeness": Only completeness analysis (null rates, business rules)
* "distribution": Only distribution analysis (statistical patterns)
* "comprehensive": Full analysis including both completeness and distribution
- sample_size (integer) [Optional] - Maximum number of rows to sample, default is 100000
- include_all_columns (boolean) [Optional] - Whether to analyze all columns, default is false
- business_rules (array) [Optional] - Business rule validations in format [{"rule_name": "email_format", "sql_condition": "email REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'"}]
- catalog_name (string) [Optional] - Target catalog name
- db_name (string) [Optional] - Target database name
""",
@@ -1085,10 +1125,58 @@ No parameters required. Returns connection status, configuration, and diagnostic
"type": "object",
"properties": {
"table_name": {"type": "string", "description": "Name of the table to analyze"},
"analysis_scope": {"type": "string", "enum": ["completeness", "distribution", "comprehensive"], "description": "Analysis scope", "default": "comprehensive"},
"catalog_name": {"type": "string", "description": "Target catalog name"},
"db_name": {"type": "string", "description": "Target database name"},
},
"required": ["table_name"],
},
),
Tool(
name="analyze_columns",
description="""[Function Description]: Analyze completeness and distribution of specified columns in a table.
[Parameter Content]:
- table_name (string) [Required] - Name of the table to analyze
- columns (array) [Required] - List of column names to analyze
- analysis_types (array) [Optional] - Types of analysis to perform, default is ["both"]
* "completeness": Only completeness analysis (null rates, non-null counts)
* "distribution": Only distribution analysis (statistical patterns by data type)
* "both": Both completeness and distribution analysis
- sample_size (integer) [Optional] - Maximum number of rows to sample, default is 100000
- catalog_name (string) [Optional] - Target catalog name
- db_name (string) [Optional] - Target database name
- detailed_response (boolean) [Optional] - Whether to return detailed response including raw data, default is false
""",
inputSchema={
"type": "object",
"properties": {
"table_name": {"type": "string", "description": "Name of the table to analyze"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "List of column names to analyze"},
"analysis_types": {"type": "array", "items": {"type": "string", "enum": ["completeness", "distribution", "both"]}, "description": "Types of analysis to perform", "default": ["both"]},
"sample_size": {"type": "integer", "description": "Maximum number of rows to sample", "default": 100000},
"include_all_columns": {"type": "boolean", "description": "Whether to analyze all columns", "default": False},
"business_rules": {"type": "array", "items": {"type": "object"}, "description": "Business rule validations"},
"catalog_name": {"type": "string", "description": "Target catalog name"},
"db_name": {"type": "string", "description": "Target database name"},
"detailed_response": {"type": "boolean", "description": "Whether to return detailed response including raw data", "default": False},
},
"required": ["table_name", "columns"],
},
),
Tool(
name="analyze_table_storage",
description="""[Function Description]: Analyze table's physical distribution and storage information.
[Parameter Content]:
- table_name (string) [Required] - Name of the table to analyze
- catalog_name (string) [Optional] - Target catalog name
- db_name (string) [Optional] - Target database name
- detailed_response (boolean) [Optional] - Whether to return detailed response including raw data, default is false
""",
inputSchema={
"type": "object",
"properties": {
"table_name": {"type": "string", "description": "Name of the table to analyze"},
"catalog_name": {"type": "string", "description": "Target catalog name"},
"db_name": {"type": "string", "description": "Target database name"},
"detailed_response": {"type": "boolean", "description": "Whether to return detailed response including raw data", "default": False},
@@ -1096,7 +1184,6 @@ No parameters required. Returns connection status, configuration, and diagnostic
"required": ["table_name"],
},
),
Tool(
name="trace_column_lineage",
description="""[Function Description]: Trace data lineage for specified columns through SQL analysis and dependency mapping.
@@ -1323,9 +1410,13 @@ No parameters required. Returns connection status, configuration, and diagnostic
elif name == "get_historical_memory_stats":
arguments["data_type"] = "historical"
result = await self._get_memory_stats_tool(arguments)
# v0.5.0 Advanced Analytics Tools
elif name == "analyze_data_quality":
result = await self._analyze_data_quality_tool(arguments)
# v0.5.0 Advanced Analytics Tools - Atomic Data Quality Tools
elif name == "get_table_basic_info":
result = await self._get_table_basic_info_tool(arguments)
elif name == "analyze_columns":
result = await self._analyze_columns_tool(arguments)
elif name == "analyze_table_storage":
result = await self._analyze_table_storage_tool(arguments)
elif name == "trace_column_lineage":
result = await self._trace_column_lineage_tool(arguments)
elif name == "monitor_data_freshness":
@@ -1595,26 +1686,46 @@ No parameters required. Returns connection status, configuration, and diagnostic
# ==================== v0.5.0 Advanced Analytics Tools Private Methods ====================
async def _analyze_data_quality_tool(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Unified data quality analysis tool routing"""
async def _get_table_basic_info_tool(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get table basic information tool routing"""
try:
# Extract parameters
table_name = arguments.get("table_name")
analysis_scope = arguments.get("analysis_scope", "comprehensive")
catalog_name = arguments.get("catalog_name")
db_name = arguments.get("db_name")
# Delegate to atomic data quality tools
result = await self.data_quality_tools.get_table_basic_info(
table_name=table_name,
catalog_name=catalog_name,
db_name=db_name
)
return result
except Exception as e:
return {
"error": str(e),
"analysis_type": "table_basic_info",
"timestamp": datetime.now().isoformat()
}
async def _analyze_columns_tool(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze columns tool routing"""
try:
table_name = arguments.get("table_name")
columns = arguments.get("columns")
analysis_types = arguments.get("analysis_types", ["both"])
sample_size = arguments.get("sample_size", 100000)
include_all_columns = arguments.get("include_all_columns", False)
business_rules = arguments.get("business_rules", [])
catalog_name = arguments.get("catalog_name")
db_name = arguments.get("db_name")
detailed_response = arguments.get("detailed_response", False)
# Delegate to the unified data quality tools
result = await self.data_quality_tools.analyze_data_quality(
# Delegate to atomic data quality tools
result = await self.data_quality_tools.analyze_columns(
table_name=table_name,
analysis_scope=analysis_scope,
columns=columns,
analysis_types=analysis_types,
sample_size=sample_size,
include_all_columns=include_all_columns,
business_rules=business_rules,
catalog_name=catalog_name,
db_name=db_name,
detailed_response=detailed_response
@@ -1625,7 +1736,32 @@ No parameters required. Returns connection status, configuration, and diagnostic
except Exception as e:
return {
"error": str(e),
"analysis_type": "unified_data_quality",
"analysis_type": "columns_analysis",
"timestamp": datetime.now().isoformat()
}
async def _analyze_table_storage_tool(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze table storage tool routing"""
try:
table_name = arguments.get("table_name")
catalog_name = arguments.get("catalog_name")
db_name = arguments.get("db_name")
detailed_response = arguments.get("detailed_response", False)
# Delegate to atomic data quality tools
result = await self.data_quality_tools.analyze_table_storage(
table_name=table_name,
catalog_name=catalog_name,
db_name=db_name,
detailed_response=detailed_response
)
return result
except Exception as e:
return {
"error": str(e),
"analysis_type": "table_storage_analysis",
"timestamp": datetime.now().isoformat()
}