[improvement]Add bucket information in the output of analyze_table_storage (#33)

This commit is contained in:
ivin
2025-07-29 14:04:36 +08:00
committed by GitHub
parent 9bb5b17199
commit fb5e864a24

View File

@@ -25,7 +25,7 @@ import time
import math
import statistics
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, cast
from collections import Counter, defaultdict
from .db import DorisConnectionManager
@@ -377,6 +377,54 @@ class DataQualityTools:
logger.warning(f"Failed to get table partitions: {str(e)}")
return []
async def _get_table_bucket_info(self, connection, table_name: str, db_name: Optional[str] = None) -> Optional[Dict]:
"""Get table buckets information"""
try:
# Query bucket information
ddl_statement = await self._get_table_ddl(connection, table_name, db_name)
if not ddl_statement:
logger.error(f"Could not retrieve DDL for table {table_name}.")
return None
pattern = r"DISTRIBUTED BY (HASH\(([^)]+)\)|RANDOM) BUCKETS (\d+|AUTO)"
matches = re.findall(pattern, cast(str, ddl_statement))
if matches:
dist_type, columns, buckets = matches[0]
column_list = [col.strip().strip("`") for col in columns.split(",")]
if dist_type.startswith('HASH'):
return {
"type": "HASH",
"columns": column_list,
"bucket_num": buckets,
}
else:
return {
"type": "RANDOM",
"bucket_num": buckets,
}
except Exception as e:
logger.warning(f"Failed to get table buckets: {str(e)}")
return None
async def _get_table_ddl(
self, connection, table_name: str, db_name: Optional[str]
) -> Optional[str]:
"""Get table DDL statement"""
try:
query = (
f"SHOW CREATE TABLE {db_name}.{table_name}"
if db_name
else f"SHOW CREATE TABLE {table_name}"
)
result = await connection.execute(query)
if result.data:
return result.data[0].get("Create Table")
return None
except Exception as e:
logger.error(f"Error getting DDL for table {table_name}: {e}")
return None
async def _get_table_size_info(self, connection, table_name: str) -> Dict[str, Any]:
"""Get table size information"""
try:
@@ -1008,7 +1056,13 @@ class DataQualityTools:
else:
partition_analysis["balance_score"] = 1.0 if len(row_counts) == 1 else 0.0
return partition_analysis
# Get bucket information
bucket_info = await self._get_table_bucket_info(connection, table_name, db_name)
return {
"partition_analysis": partition_analysis,
"bucket_analysis": bucket_info
}
except Exception as e:
logger.warning(f"Failed to analyze physical distribution: {str(e)}")