diff --git a/internal/components/monitor/helpers.go b/internal/components/monitor/helpers.go
new file mode 100644
index 0000000..1b958d3
--- /dev/null
+++ b/internal/components/monitor/helpers.go
@@ -0,0 +1,52 @@
+package monitor
+
+import (
+	"fmt"
+	"slices"
+)
+
+// supportedMonitoringOperations lists every monitoring operation this package
+// treats as supported. It is shared by the helpers below and must not be
+// modified after initialization.
+var supportedMonitoringOperations = []string{
+	string(OpMetrics), string(OpResourceHealth), string(OpAppInsights),
+	string(OpDiagnostics), string(OpControlPlaneLogs),
+}
+
+// ValidateMonitoringOperation reports whether operation exactly matches one of
+// the supported monitoring operations.
+func ValidateMonitoringOperation(operation string) bool {
+	return slices.Contains(supportedMonitoringOperations, operation)
+}
+
+// GetSupportedMonitoringOperations returns all supported monitoring operations.
+// The result is a copy, so callers may sort or modify it without affecting
+// ValidateMonitoringOperation or later calls.
+func GetSupportedMonitoringOperations() []string {
+	return slices.Clone(supportedMonitoringOperations)
+}
+
+// ValidateMetricsQueryType reports whether queryType is one of the supported
+// metrics query types: "list", "list-definitions" or "list-namespaces".
+func ValidateMetricsQueryType(queryType string) bool {
+	supportedTypes := []string{"list", "list-definitions", "list-namespaces"}
+	return slices.Contains(supportedTypes, queryType)
+}
+
+// MapMetricsQueryTypeToCommand maps a metrics query type to its corresponding
+// az command. It returns an empty string and an error when queryType has no
+// mapping.
+func MapMetricsQueryTypeToCommand(queryType string) (string, error) {
+	commandMap := map[string]string{
+		"list":             "az monitor metrics list",
+		"list-definitions": "az monitor metrics list-definitions",
+		"list-namespaces":  "az monitor metrics list-namespaces",
+	}
+
+	cmd, exists := commandMap[queryType]
+	if !exists {
+		return "", fmt.Errorf("unsupported metrics query type '%s'. Supported types: list, list-definitions, list-namespaces", queryType)
+	}
+
+	return cmd, nil
+}
diff --git a/internal/components/monitor/registry.go b/internal/components/monitor/registry.go
index fb1b321..9158597 100644
--- a/internal/components/monitor/registry.go
+++ b/internal/components/monitor/registry.go
@@ -1,9 +1,6 @@
 package monitor
 
 import (
-	"fmt"
-	"slices"
-
 	"github.com/mark3labs/mcp-go/mcp"
 )
 
@@ -22,84 +19,106 @@ const (
 func RegisterAzMonitoring() mcp.Tool {
 	description := `Unified tool for Azure monitoring and diagnostics operations for AKS clusters.
 
-Supported operations:
-- metrics: Query metrics for Azure resources (list, list-definitions, list-namespaces)
-- resource_health: Get resource health events for AKS clusters
-- app_insights: Execute KQL queries against Application Insights data
-- diagnostics: Check AKS cluster diagnostic settings configuration
-- control_plane_logs: Query AKS control plane logs with safety constraints
+Supported Operations:
+
+1. Metrics - Query Azure Monitor metrics for AKS clusters and nodes
+ - list: Get metric values for specific metrics
+ - list-definitions: Get available metrics for a resource
+ - list-namespaces: Get metric namespaces for a resource
+
+ Use for: CPU usage, memory consumption, network traffic, pod counts, node health
+ Required parameters: resource (Azure resource ID)
+ Additional for 'list': metrics (metric names)
+ Optional: aggregation, start-time, end-time, interval, filter
+
+2. Resource Health - Get Azure Resource Health events for AKS clusters
+ Use for: Cluster availability issues, platform problems, service health events
+ Required parameters: subscription_id, resource_group, cluster_name, start_time
+ Optional: end_time, status (Available, Unavailable, Degraded, Unknown)
+
+3. Application Insights - Execute KQL queries against Application Insights telemetry
+ Use for: Application performance monitoring, custom telemetry analysis, trace correlation
+ Required parameters: subscription_id, resource_group, app_insights_name, query
+ Optional: start_time + end_time OR timespan (not both)
+
+4. Diagnostics - Check AKS cluster diagnostic settings configuration
+ Use for: Verify logging is enabled, check log retention, validate diagnostic configuration
+ Required parameters: subscription_id, resource_group, cluster_name
+
+5. Control Plane Logs - Query AKS control plane logs
+ Supported log categories:
+ - kube-apiserver
+ - kube-audit
+ - kube-audit-admin
+ - kube-controller-manager
+ - kube-scheduler
+ - cluster-autoscaler
+ - cloud-controller-manager
+ - guard (for authentication/authorization issues)
+ - csi-azuredisk-controller
+ - csi-azurefile-controller
+ - csi-snapshot-controller
+ - fleet-member-agent
+ - fleet-member-net-controller-manager
+ - fleet-mcs-controller-manager
+ PLEASE NOTE: you need to check if the category is enabled in your cluster's diagnostic settings by using the diagnostics tool.
+
+Use This Tool When You Need To:
+- Monitor cluster or other azure resource performance and usage (use metrics)
+- Check cluster availability and platform health (use resource_health)
+- Analyze application telemetry and performance (use app_insights)
+- Verify diagnostic logging configuration (use diagnostics)
+- Debug Kubernetes API server issues (use control_plane_logs with kube-apiserver)
+- Investigate authentication/authorization problems (use control_plane_logs with kube-audit, guard)
+- Troubleshoot pod scheduling issues (use control_plane_logs with kube-scheduler)
+- Check storage-related problems (use control_plane_logs with csi-azuredisk-controller, csi-azurefile-controller)
+- Analyze cluster scaling behavior (use control_plane_logs with cluster-autoscaler)
+- Review security audit events (use control_plane_logs with kube-audit, kube-audit-admin)
 
 Examples:
-- List metrics: operation="metrics", query_type="list", parameters="{\"resource\":\"\", \"metrics\": \"node_cpu_usage_percentage\", \"aggregation\": \"Average\"}"
-- List metrics definitions: operation="metrics", query_type="list-definitions", parameters="{\"resource\":\"\"}"
-- List metrics namespaces: operation="metrics", query_type="list-namespaces", parameters="{\"resource\":\"\"}"
-- Resource health: operation="resource_health", subscription_id="", resource_group="", cluster_name="", parameters="{\"start_time\":\"2025-01-01T00:00:00Z\"}"
-- App Insights query: operation="app_insights", subscription_id="", resource_group="", parameters="{\"app_insights_name\":\"...\", \"query\":\"...\"}"
-- Check diagnostics: operation="diagnostics", parameters="{\"subscription_id\":\"\", \"resource_group\":\"\", \"cluster_name\":\"\"}"
-- Query AKS control plane logs: operation="control_plane_logs", subscription_id="", resource_group="", cluster_name="", parameters="{\"log_category\":\"kube-apiserver\", \"start_time\":\"\", \"end_time\":\"\", \"max_records\":\"50\"}"
-- Query AKS control plane logs with filters: operation="control_plane_logs", subscription_id="", resource_group="", cluster_name="", parameters="{\"log_category\":\"kube-apiserver\", \"log_level\":\"error\", \"start_time\":\"\", \"end_time\":\"\", \"max_records\":\"50\"}"
+
+metrics:
+- Get CPU usage: operation="metrics", query_type="list", parameters="{\"resource\":\"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster\", \"metrics\":\"node_cpu_usage_percentage\", \"aggregation\":\"Average\", \"start-time\":\"\", \"end-time\":\"\"}"
+- List available metrics: operation="metrics", query_type="list-definitions", parameters="{\"resource\":\"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster\"}"
+
+resource_health:
+- Check recent cluster health: operation="resource_health", subscription_id="", resource_group="", cluster_name="", parameters="{\"start_time\":\"\"}"
+
+app_insights:
+- Query request telemetry: operation="app_insights", subscription_id="", resource_group="", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"requests | where timestamp > ago(1h) | summarize count() by bin(timestamp, 5m)\"}"
+- Analyze exceptions: operation="app_insights", subscription_id="", resource_group="", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"exceptions | where timestamp > ago(24h) | summarize count() by type, bin(timestamp, 1h)\"}"
+- Performance with timespan: operation="app_insights", subscription_id="", resource_group="", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"performanceCounters | where category == 'Processor' | summarize avg(value) by bin(timestamp, 5m)\", \"timespan\":\"PT1H\"}"
+
+diagnostics:
+- Verify diagnostic settings: operation="diagnostics", subscription_id="", resource_group="", cluster_name="", parameters="{}"
+
+control_plane_logs:
+- Query API server logs: operation="control_plane_logs", subscription_id="", resource_group="", cluster_name="", parameters="{\"log_category\":\"kube-apiserver\", \"start_time\":\"\", \"end_time\":\"\", \"max_records\":\"50\"}"
+- Debug authentication issues: operation="control_plane_logs", subscription_id="", resource_group="", cluster_name="", parameters="{\"log_category\":\"guard\", \"start_time\":\"\", \"end_time\":\"\", \"max_records\":\"100\"}"
+- Analyze audit events: operation="control_plane_logs", subscription_id="", resource_group="", cluster_name="", parameters="{\"log_category\":\"kube-audit\", \"log_level\":\"error\", \"start_time\":\"\", \"end_time\":\"\", \"max_records\":\"50\"}"
 `
 
 	return mcp.NewTool("az_monitoring",
 		mcp.WithDescription(description),
 		mcp.WithString("operation",
 			mcp.Required(),
-			mcp.Description("The monitoring operation to perform"),
+			mcp.Description("The monitoring operation to perform: 'metrics' (CPU/memory/network), 'resource_health' (cluster availability), 'app_insights' (telemetry analysis), 'diagnostics' (logging config), 'control_plane_logs' (Kubernetes logs like kube-apiserver, kube-audit, guard, etc.)"),
 		),
 		mcp.WithString("query_type",
-			mcp.Description("Specific type of query for metrics operations (list, list-definitions, list-namespaces)"),
+			mcp.Description("For metrics operations only: 'list' (get metric values), 'list-definitions' (available metrics), 'list-namespaces' (metric categories)"),
 		),
 		mcp.WithString("parameters",
 			mcp.Required(),
-			mcp.Description("JSON string containing operation-specific parameters"),
+			mcp.Description("JSON string with operation parameters. metrics: resource (required), metrics (required for 'list' query_type), aggregation/start-time/end-time/interval/filter (optional). resource_health: start_time, end_time, status. app_insights: app_insights_name, query, start_time/end_time OR timespan (optional). diagnostics: none required. control_plane_logs: log_category (kube-apiserver/kube-audit/guard/etc), start_time, end_time, max_records, log_level"),
 		),
 		mcp.WithString("subscription_id",
-			mcp.Description("Azure subscription ID (can be included in parameters)"),
+			mcp.Description("Azure subscription ID (required for resource_health, app_insights, diagnostics, control_plane_logs)"),
 		),
 		mcp.WithString("resource_group",
-			mcp.Description("Resource group name (can be included in parameters)"),
+			mcp.Description("Resource group name (required for resource_health, app_insights, diagnostics, control_plane_logs)"),
 		),
 		mcp.WithString("cluster_name",
-			mcp.Description("AKS cluster name (can be included in parameters)"),
+			mcp.Description("AKS cluster name (required for resource_health, diagnostics, control_plane_logs)"),
 		),
 	)
 }
-
-// ValidateMonitoringOperation checks if the monitoring operation is supported
-func ValidateMonitoringOperation(operation string) bool {
-	supportedOps := []string{
-		string(OpMetrics), string(OpResourceHealth), string(OpAppInsights),
-		string(OpDiagnostics), string(OpControlPlaneLogs),
-	}
-	return slices.Contains(supportedOps, operation)
-}
-
-// GetSupportedMonitoringOperations returns all supported monitoring operations
-func GetSupportedMonitoringOperations() []string {
-	return []string{
-		string(OpMetrics), string(OpResourceHealth), string(OpAppInsights),
-		string(OpDiagnostics), string(OpControlPlaneLogs),
-	}
-}
-
-// ValidateMetricsQueryType checks if the metrics query type is supported
-func ValidateMetricsQueryType(queryType string) bool {
-	supportedTypes := []string{"list", "list-definitions", "list-namespaces"}
-	return slices.Contains(supportedTypes, queryType)
-}
-
-// MapMetricsQueryTypeToCommand maps a metrics query type to its corresponding az command
-func MapMetricsQueryTypeToCommand(queryType string) (string, error) {
-	commandMap := map[string]string{
-		"list":             "az monitor metrics list",
-		"list-definitions": "az monitor metrics list-definitions",
-		"list-namespaces":  "az monitor metrics list-namespaces",
-	}
-
-	cmd, exists := commandMap[queryType]
-	if !exists {
-		return "", fmt.Errorf("no command mapping for metrics query type: %s", queryType)
-	}
-
-	return cmd, nil
-}