11package monitor
22
33import (
4- "fmt"
5- "slices"
6-
74 "github.com/mark3labs/mcp-go/mcp"
85)
96
@@ -22,84 +19,106 @@ const (
2219func RegisterAzMonitoring () mcp.Tool {
2320 description := `Unified tool for Azure monitoring and diagnostics operations for AKS clusters.
2421
25- Supported operations:
26- - metrics: Query metrics for Azure resources (list, list-definitions, list-namespaces)
27- - resource_health: Get resource health events for AKS clusters
28- - app_insights: Execute KQL queries against Application Insights data
29- - diagnostics: Check AKS cluster diagnostic settings configuration
30- - control_plane_logs: Query AKS control plane logs with safety constraints
22+ Supported Operations:
23+
24+ 1. Metrics - Query Azure Monitor metrics for AKS clusters and nodes
25+ - list: Get metric values for specific metrics
26+ - list-definitions: Get available metrics for a resource
27+ - list-namespaces: Get metric namespaces for a resource
28+
29+ Use for: CPU usage, memory consumption, network traffic, pod counts, node health
30+ Required parameters: resource (Azure resource ID)
31+ Additional for 'list': metrics (metric names)
32+ Optional: aggregation, start-time, end-time, interval, filter
33+
34+ 2. Resource Health - Get Azure Resource Health events for AKS clusters
35+ Use for: Cluster availability issues, platform problems, service health events
36+ Required parameters: subscription_id, resource_group, cluster_name, start_time
37+ Optional: end_time, status (Available, Unavailable, Degraded, Unknown)
38+
39+ 3. Application Insights - Execute KQL queries against Application Insights telemetry
40+ Use for: Application performance monitoring, custom telemetry analysis, trace correlation
41+ Required parameters: subscription_id, resource_group, app_insights_name, query
42+ Optional: start_time + end_time OR timespan (not both)
43+
44+ 4. Diagnostics - Check AKS cluster diagnostic settings configuration
45+ Use for: Verify logging is enabled, check log retention, validate diagnostic configuration
46+ Required parameters: subscription_id, resource_group, cluster_name
47+
48+ 5. Control Plane Logs - Query AKS control plane logs
49+ Supported log categories:
50+ - kube-apiserver
51+ - kube-audit
52+ - kube-audit-admin
53+ - kube-controller-manager
54+ - kube-scheduler
55+ - cluster-autoscaler
56+ - cloud-controller-manager
57+ - guard (for authentication/authorization issues)
58+ - csi-azuredisk-controller
59+ - csi-azurefile-controller
60+ - csi-snapshot-controller
61+ - fleet-member-agent
62+ - fleet-member-net-controller-manager
63+ - fleet-mcs-controller-manager
64+ PLEASE NOTE: you need to check if the category is enabled in your cluster's diagnostic settings by using the diagnostics tool.
65+
66+ Use This Tool When You Need To:
67+ - Monitor cluster or other azure resource performance and usage (use metrics)
68+ - Check cluster availability and platform health (use resource_health)
69+ - Analyze application telemetry and performance (use app_insights)
70+ - Verify diagnostic logging configuration (use diagnostics)
71+ - Debug Kubernetes API server issues (use control_plane_logs with kube-apiserver)
72+ - Investigate authentication/authorization problems (use control_plane_logs with kube-audit, guard)
73+ - Troubleshoot pod scheduling issues (use control_plane_logs with kube-scheduler)
74+ - Check storage-related problems (use control_plane_logs with csi-azuredisk-controller, csi-azurefile-controller)
75+ - Analyze cluster scaling behavior (use control_plane_logs with cluster-autoscaler)
76+ - Review security audit events (use control_plane_logs with kube-audit, kube-audit-admin)
3177
3278Examples:
33- - List metrics: operation="metrics", query_type="list", parameters="{\"resource\":\"<aks-cluster-id>\", \"metrics\": \"node_cpu_usage_percentage\", \"aggregation\": \"Average\"}"
34- - List metrics definitions: operation="metrics", query_type="list-definitions", parameters="{\"resource\":\"<aks-cluster-id>\"}"
35- - List metrics namespaces: operation="metrics", query_type="list-namespaces", parameters="{\"resource\":\"<aks-cluster-id>\"}"
36- - Resource health: operation="resource_health", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"start_time\":\"2025-01-01T00:00:00Z\"}"
37- - App Insights query: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"...\", \"query\":\"...\"}"
38- - Check diagnostics: operation="diagnostics", parameters="{\"subscription_id\":\"<subscription-id>\", \"resource_group\":\"<resource-group>\", \"cluster_name\":\"<cluster-name>\"}"
39- - Query AKS control plane logs: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-apiserver\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
40- - Query AKS control plane logs with filters: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-apiserver\", \"log_level\":\"error\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
79+
80+ metrics:
81+ - Get CPU usage: operation="metrics", query_type="list", parameters="{\"resource\":\"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster\", \"metrics\":\"node_cpu_usage_percentage\", \"aggregation\":\"Average\", \"start-time\":\"<start-time>\", \"end-time\":\"<end-time>\"}"
82+ - List available metrics: operation="metrics", query_type="list-definitions", parameters="{\"resource\":\"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster\"}"
83+
84+ resource_health:
85+ - Check recent cluster health: operation="resource_health", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"start_time\":\"<start-time>\"}"
86+
87+ app_insights:
88+ - Query request telemetry: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"requests | where timestamp > ago(1h) | summarize count() by bin(timestamp, 5m)\"}"
89+ - Analyze exceptions: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"exceptions | where timestamp > ago(24h) | summarize count() by type, bin(timestamp, 1h)\"}"
90+ - Performance with timespan: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"performanceCounters | where category == 'Processor' | summarize avg(value) by bin(timestamp, 5m)\", \"timespan\":\"PT1H\"}"
91+
92+ diagnostics:
93+ - Verify diagnostic settings: operation="diagnostics", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{}"
94+
95+ control_plane_logs:
96+ - Query API server logs: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-apiserver\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
97+ - Debug authentication issues: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"guard\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"100\"}"
98+ - Analyze audit events: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-audit\", \"log_level\":\"error\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
4199`
42100
43101 return mcp .NewTool ("az_monitoring" ,
44102 mcp .WithDescription (description ),
45103 mcp .WithString ("operation" ,
46104 mcp .Required (),
47- mcp .Description ("The monitoring operation to perform" ),
105+ mcp .Description ("The monitoring operation to perform: 'metrics' (CPU/memory/network), 'resource_health' (cluster availability), 'app_insights' (telemetry analysis), 'diagnostics' (logging config), 'control_plane_logs' (Kubernetes logs like kube-apiserver, kube-audit, guard, etc.) " ),
48106 ),
49107 mcp .WithString ("query_type" ,
50- mcp .Description ("Specific type of query for metrics operations (list, list-definitions, list-namespaces)" ),
108+ mcp .Description ("For metrics operations only: 'list' (get metric values), ' list-definitions' (available metrics), ' list-namespaces' (metric categories )" ),
51109 ),
52110 mcp .WithString ("parameters" ,
53111 mcp .Required (),
54- mcp .Description ("JSON string containing operation-specific parameters" ),
112+ mcp .Description ("JSON string with operation parameters. metrics: resource (required), metrics (required for 'list' query_type), aggregation/start-time/end-time/interval/filter (optional). resource_health: start_time, end_time, status. app_insights: app_insights_name, query, start_time/end_time OR timespan (optional). diagnostics: none required. control_plane_logs: log_category (kube-apiserver/kube-audit/guard/etc), start_time, end_time, max_records, log_level " ),
55113 ),
56114 mcp .WithString ("subscription_id" ,
57- mcp .Description ("Azure subscription ID (can be included in parameters )" ),
115+ mcp .Description ("Azure subscription ID (required for resource_health, app_insights, diagnostics, control_plane_logs )" ),
58116 ),
59117 mcp .WithString ("resource_group" ,
60- mcp .Description ("Resource group name (can be included in parameters )" ),
118+ mcp .Description ("Resource group name (required for resource_health, app_insights, diagnostics, control_plane_logs )" ),
61119 ),
62120 mcp .WithString ("cluster_name" ,
63- mcp .Description ("AKS cluster name (can be included in parameters )" ),
121+ mcp .Description ("AKS cluster name (required for resource_health, diagnostics, control_plane_logs )" ),
64122 ),
65123 )
66124}
67-
68- // ValidateMonitoringOperation checks if the monitoring operation is supported
69- func ValidateMonitoringOperation (operation string ) bool {
70- supportedOps := []string {
71- string (OpMetrics ), string (OpResourceHealth ), string (OpAppInsights ),
72- string (OpDiagnostics ), string (OpControlPlaneLogs ),
73- }
74- return slices .Contains (supportedOps , operation )
75- }
76-
77- // GetSupportedMonitoringOperations returns all supported monitoring operations
78- func GetSupportedMonitoringOperations () []string {
79- return []string {
80- string (OpMetrics ), string (OpResourceHealth ), string (OpAppInsights ),
81- string (OpDiagnostics ), string (OpControlPlaneLogs ),
82- }
83- }
84-
// ValidateMetricsQueryType reports whether queryType is a supported metrics
// query type. The match is exact and case-sensitive; the accepted values are
// "list", "list-definitions" and "list-namespaces".
func ValidateMetricsQueryType(queryType string) bool {
	// A switch over the three constant keys avoids allocating a slice per call.
	switch queryType {
	case "list", "list-definitions", "list-namespaces":
		return true
	default:
		return false
	}
}
90-
// MapMetricsQueryTypeToCommand maps a metrics query type to its corresponding
// az command line prefix.
//
// It returns "az monitor metrics list", "az monitor metrics list-definitions"
// or "az monitor metrics list-namespaces" for the query types "list",
// "list-definitions" and "list-namespaces" respectively. Any other value
// yields an empty string and a non-nil error naming the unknown query type.
func MapMetricsQueryTypeToCommand(queryType string) (string, error) {
	// A switch avoids rebuilding a lookup map on every call.
	switch queryType {
	case "list":
		return "az monitor metrics list", nil
	case "list-definitions":
		return "az monitor metrics list-definitions", nil
	case "list-namespaces":
		return "az monitor metrics list-namespaces", nil
	default:
		return "", fmt.Errorf("no command mapping for metrics query type: %s", queryType)
	}
}
0 commit comments