[BUG]Optimize and fix the capabilities of 0.5.0 tools (#26)
1. **Unified Naming for CLI Arguments and Environment Variables** - All database-related CLI arguments now use the `--doris-*` prefix, and environment variables use `DORIS_*` for consistency and maintainability. - Backward compatibility: old `--db-*` arguments are still supported. 2. **Automatic Filtering of System SQL in Slow Query TopN** - Slow query analysis now automatically excludes SQL statements involving `__internal_schema`, `information_schema`, and `mysql` system databases, ensuring only business-related slow queries are counted. - Filtering is performed at the SQL level using `NOT LIKE` and `state != 'ERR'` for efficiency and safety. 3. **Unified Query Timeout Configuration** - If no `timeout` is specified for query execution, the system will use the `config.performance.query_timeout` value as the default, falling back to 30 seconds if not configured. - This avoids hardcoding and makes timeout management more flexible. 4. **Tool execution optimization** - Significantly reduce the execution time of some data governance and operation and maintenance tools - Optimize execution logic and reduce data scanning - Enable concurrent scanning to speed up retrieval 5. **Log system optimization** - Fix the Console log printing logic and output the log content correctly - Add advanced tool execution process log output to facilitate further positioning of error locations 6. **DB Connection optimization** - Fixed a connection pool acquisition exception caused by deadlock 7. **Other Improvements** - Help documentation and CLI examples updated to reflect new and legacy parameter compatibility. - Code comments and documentation further standardized for better team collaboration and open-source community understanding.
This commit is contained in:
@@ -137,6 +137,33 @@ class PerformanceConfig:
|
||||
max_response_content_size: int = 4096
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataQualityConfig:
|
||||
"""Data quality analysis configuration"""
|
||||
|
||||
# Column analysis configuration
|
||||
max_columns_per_batch: int = 20 # Maximum columns to analyze in a single batch
|
||||
default_sample_size: int = 100000 # Default sample size for analysis
|
||||
|
||||
# Sampling strategy configuration
|
||||
small_table_threshold: int = 100000 # Tables smaller than this use full table analysis
|
||||
medium_table_threshold: int = 1000000 # Tables smaller than this use simple LIMIT sampling
|
||||
# Tables larger than medium_table_threshold use systematic sampling
|
||||
|
||||
# Performance optimization
|
||||
enable_batch_analysis: bool = True # Enable batch analysis for multiple columns
|
||||
batch_timeout: int = 300 # Timeout for batch analysis in seconds
|
||||
|
||||
# Accuracy vs Performance trade-off
|
||||
enable_fast_mode: bool = False # Use approximate algorithms for faster results
|
||||
fast_mode_sample_size: int = 10000 # Sample size for fast mode
|
||||
|
||||
# Statistical analysis configuration
|
||||
enable_distribution_analysis: bool = True # Enable distribution analysis
|
||||
histogram_bins: int = 20 # Number of bins for histogram analysis
|
||||
percentile_levels: list[float] = field(default_factory=lambda: [0.25, 0.5, 0.75, 0.95, 0.99]) # Percentile levels to calculate
|
||||
|
||||
|
||||
@dataclass
|
||||
class ADBCConfig:
|
||||
"""ADBC (Arrow Flight SQL) configuration"""
|
||||
@@ -208,6 +235,7 @@ class DorisConfig:
|
||||
database: DatabaseConfig = field(default_factory=DatabaseConfig)
|
||||
security: SecurityConfig = field(default_factory=SecurityConfig)
|
||||
performance: PerformanceConfig = field(default_factory=PerformanceConfig)
|
||||
data_quality: DataQualityConfig = field(default_factory=DataQualityConfig)
|
||||
logging: LoggingConfig = field(default_factory=LoggingConfig)
|
||||
monitoring: MonitoringConfig = field(default_factory=MonitoringConfig)
|
||||
adbc: ADBCConfig = field(default_factory=ADBCConfig)
|
||||
@@ -404,6 +432,38 @@ class DorisConfig:
|
||||
os.getenv("ADBC_ENABLED", str(config.adbc.enabled).lower()).lower() == "true"
|
||||
)
|
||||
|
||||
# Data quality configuration
|
||||
config.data_quality.max_columns_per_batch = int(
|
||||
os.getenv("DATA_QUALITY_MAX_COLUMNS_PER_BATCH", str(config.data_quality.max_columns_per_batch))
|
||||
)
|
||||
config.data_quality.default_sample_size = int(
|
||||
os.getenv("DATA_QUALITY_DEFAULT_SAMPLE_SIZE", str(config.data_quality.default_sample_size))
|
||||
)
|
||||
config.data_quality.small_table_threshold = int(
|
||||
os.getenv("DATA_QUALITY_SMALL_TABLE_THRESHOLD", str(config.data_quality.small_table_threshold))
|
||||
)
|
||||
config.data_quality.medium_table_threshold = int(
|
||||
os.getenv("DATA_QUALITY_MEDIUM_TABLE_THRESHOLD", str(config.data_quality.medium_table_threshold))
|
||||
)
|
||||
config.data_quality.enable_batch_analysis = (
|
||||
os.getenv("DATA_QUALITY_ENABLE_BATCH_ANALYSIS", str(config.data_quality.enable_batch_analysis).lower()).lower() == "true"
|
||||
)
|
||||
config.data_quality.batch_timeout = int(
|
||||
os.getenv("DATA_QUALITY_BATCH_TIMEOUT", str(config.data_quality.batch_timeout))
|
||||
)
|
||||
config.data_quality.enable_fast_mode = (
|
||||
os.getenv("DATA_QUALITY_ENABLE_FAST_MODE", str(config.data_quality.enable_fast_mode).lower()).lower() == "true"
|
||||
)
|
||||
config.data_quality.fast_mode_sample_size = int(
|
||||
os.getenv("DATA_QUALITY_FAST_MODE_SAMPLE_SIZE", str(config.data_quality.fast_mode_sample_size))
|
||||
)
|
||||
config.data_quality.enable_distribution_analysis = (
|
||||
os.getenv("DATA_QUALITY_ENABLE_DISTRIBUTION_ANALYSIS", str(config.data_quality.enable_distribution_analysis).lower()).lower() == "true"
|
||||
)
|
||||
config.data_quality.histogram_bins = int(
|
||||
os.getenv("DATA_QUALITY_HISTOGRAM_BINS", str(config.data_quality.histogram_bins))
|
||||
)
|
||||
|
||||
# Server configuration
|
||||
config.server_name = os.getenv("SERVER_NAME", config.server_name)
|
||||
config.server_version = os.getenv("SERVER_VERSION", config.server_version)
|
||||
@@ -443,6 +503,13 @@ class DorisConfig:
|
||||
if hasattr(config.performance, key):
|
||||
setattr(config.performance, key, value)
|
||||
|
||||
# Update data quality configuration
|
||||
if "data_quality" in config_data:
|
||||
dq_config = config_data["data_quality"]
|
||||
for key, value in dq_config.items():
|
||||
if hasattr(config.data_quality, key):
|
||||
setattr(config.data_quality, key, value)
|
||||
|
||||
# Update logging configuration
|
||||
if "logging" in config_data:
|
||||
log_config = config_data["logging"]
|
||||
@@ -516,6 +583,19 @@ class DorisConfig:
|
||||
"idle_timeout": self.performance.idle_timeout,
|
||||
"max_response_content_size": self.performance.max_response_content_size,
|
||||
},
|
||||
"data_quality": {
|
||||
"max_columns_per_batch": self.data_quality.max_columns_per_batch,
|
||||
"default_sample_size": self.data_quality.default_sample_size,
|
||||
"small_table_threshold": self.data_quality.small_table_threshold,
|
||||
"medium_table_threshold": self.data_quality.medium_table_threshold,
|
||||
"enable_batch_analysis": self.data_quality.enable_batch_analysis,
|
||||
"batch_timeout": self.data_quality.batch_timeout,
|
||||
"enable_fast_mode": self.data_quality.enable_fast_mode,
|
||||
"fast_mode_sample_size": self.data_quality.fast_mode_sample_size,
|
||||
"enable_distribution_analysis": self.data_quality.enable_distribution_analysis,
|
||||
"histogram_bins": self.data_quality.histogram_bins,
|
||||
"percentile_levels": self.data_quality.percentile_levels,
|
||||
},
|
||||
"logging": {
|
||||
"level": self.logging.level,
|
||||
"format": self.logging.format,
|
||||
@@ -602,6 +682,31 @@ class DorisConfig:
|
||||
if self.performance.query_timeout <= 0:
|
||||
errors.append("Query timeout must be greater than 0")
|
||||
|
||||
# Validate data quality configuration
|
||||
if self.data_quality.max_columns_per_batch <= 0:
|
||||
errors.append("Max columns per batch must be greater than 0")
|
||||
|
||||
if self.data_quality.default_sample_size <= 0:
|
||||
errors.append("Default sample size must be greater than 0")
|
||||
|
||||
if self.data_quality.small_table_threshold <= 0:
|
||||
errors.append("Small table threshold must be greater than 0")
|
||||
|
||||
if self.data_quality.medium_table_threshold <= 0:
|
||||
errors.append("Medium table threshold must be greater than 0")
|
||||
|
||||
if self.data_quality.small_table_threshold >= self.data_quality.medium_table_threshold:
|
||||
errors.append("Small table threshold must be less than medium table threshold")
|
||||
|
||||
if self.data_quality.batch_timeout <= 0:
|
||||
errors.append("Batch timeout must be greater than 0")
|
||||
|
||||
if self.data_quality.fast_mode_sample_size <= 0:
|
||||
errors.append("Fast mode sample size must be greater than 0")
|
||||
|
||||
if self.data_quality.histogram_bins <= 0:
|
||||
errors.append("Histogram bins must be greater than 0")
|
||||
|
||||
# Validate logging configuration
|
||||
if self.logging.level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
|
||||
errors.append("Log level must be one of DEBUG, INFO, WARNING, ERROR, or CRITICAL")
|
||||
|
||||
Reference in New Issue
Block a user