Skip to content

Commit dc3bb85

Browse files
authored
Merge pull request #140 from Azure/guwe/monitoring-instruction
Chore: clear instruction for az_monitor
2 parents 7c80b0e + 411ca34 commit dc3bb85

File tree

2 files changed

+125
-62
lines changed

2 files changed

+125
-62
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package monitor
2+
3+
import (
4+
"fmt"
5+
"slices"
6+
)
7+
8+
// supportedMonitoringOperations defines all supported monitoring operations
9+
var supportedMonitoringOperations = []string{
10+
string(OpMetrics), string(OpResourceHealth), string(OpAppInsights),
11+
string(OpDiagnostics), string(OpControlPlaneLogs),
12+
}
13+
14+
// ValidateMonitoringOperation checks if the monitoring operation is supported
15+
func ValidateMonitoringOperation(operation string) bool {
16+
return slices.Contains(supportedMonitoringOperations, operation)
17+
}
18+
19+
// GetSupportedMonitoringOperations returns all supported monitoring operations
20+
func GetSupportedMonitoringOperations() []string {
21+
return supportedMonitoringOperations
22+
}
23+
24+
// ValidateMetricsQueryType reports whether queryType is one of the metrics
// query types handled here: "list", "list-definitions", or "list-namespaces".
// The match is exact and case-sensitive.
func ValidateMetricsQueryType(queryType string) bool {
	known := [...]string{"list", "list-definitions", "list-namespaces"}
	return slices.Index(known[:], queryType) >= 0
}
29+
30+
// MapMetricsQueryTypeToCommand translates a metrics query type into the az CLI
// command that serves it.
//
// It returns the command string for "list", "list-definitions", and
// "list-namespaces". For any other value it returns an empty string and an
// error naming the rejected type and the supported ones.
func MapMetricsQueryTypeToCommand(queryType string) (string, error) {
	switch queryType {
	case "list":
		return "az monitor metrics list", nil
	case "list-definitions":
		return "az monitor metrics list-definitions", nil
	case "list-namespaces":
		return "az monitor metrics list-namespaces", nil
	}

	return "", fmt.Errorf("unsupported metrics query type '%s'. Supported types: list, list-definitions, list-namespaces", queryType)
}

internal/components/monitor/registry.go

Lines changed: 81 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
package monitor
22

33
import (
4-
"fmt"
5-
"slices"
6-
74
"github.com/mark3labs/mcp-go/mcp"
85
)
96

@@ -22,84 +19,106 @@ const (
2219
func RegisterAzMonitoring() mcp.Tool {
2320
description := `Unified tool for Azure monitoring and diagnostics operations for AKS clusters.
2421
25-
Supported operations:
26-
- metrics: Query metrics for Azure resources (list, list-definitions, list-namespaces)
27-
- resource_health: Get resource health events for AKS clusters
28-
- app_insights: Execute KQL queries against Application Insights data
29-
- diagnostics: Check AKS cluster diagnostic settings configuration
30-
- control_plane_logs: Query AKS control plane logs with safety constraints
22+
Supported Operations:
23+
24+
1. Metrics - Query Azure Monitor metrics for AKS clusters and nodes
25+
- list: Get metric values for specific metrics
26+
- list-definitions: Get available metrics for a resource
27+
- list-namespaces: Get metric namespaces for a resource
28+
29+
Use for: CPU usage, memory consumption, network traffic, pod counts, node health
30+
Required parameters: resource (Azure resource ID)
31+
Additional for 'list': metrics (metric names)
32+
Optional: aggregation, start-time, end-time, interval, filter
33+
34+
2. Resource Health - Get Azure Resource Health events for AKS clusters
35+
Use for: Cluster availability issues, platform problems, service health events
36+
Required parameters: subscription_id, resource_group, cluster_name, start_time
37+
Optional: end_time, status (Available, Unavailable, Degraded, Unknown)
38+
39+
3. Application Insights - Execute KQL queries against Application Insights telemetry
40+
Use for: Application performance monitoring, custom telemetry analysis, trace correlation
41+
Required parameters: subscription_id, resource_group, app_insights_name, query
42+
Optional: start_time + end_time OR timespan (not both)
43+
44+
4. Diagnostics - Check AKS cluster diagnostic settings configuration
45+
Use for: Verify logging is enabled, check log retention, validate diagnostic configuration
46+
Required parameters: subscription_id, resource_group, cluster_name
47+
48+
5. Control Plane Logs - Query AKS control plane logs
49+
Supported log categories:
50+
- kube-apiserver
51+
- kube-audit
52+
- kube-audit-admin
53+
- kube-controller-manager
54+
- kube-scheduler
55+
- cluster-autoscaler
56+
- cloud-controller-manager
57+
- guard (for authentication/authorization issues)
58+
- csi-azuredisk-controller
59+
- csi-azurefile-controller
60+
- csi-snapshot-controller
61+
- fleet-member-agent
62+
- fleet-member-net-controller-manager
63+
- fleet-mcs-controller-manager
64+
PLEASE NOTE: you need to check if the category is enabled in your cluster's diagnostic settings by using the diagnostics tool.
65+
66+
Use This Tool When You Need To:
67+
- Monitor cluster or other azure resource performance and usage (use metrics)
68+
- Check cluster availability and platform health (use resource_health)
69+
- Analyze application telemetry and performance (use app_insights)
70+
- Verify diagnostic logging configuration (use diagnostics)
71+
- Debug Kubernetes API server issues (use control_plane_logs with kube-apiserver)
72+
- Investigate authentication/authorization problems (use control_plane_logs with kube-audit, guard)
73+
- Troubleshoot pod scheduling issues (use control_plane_logs with kube-scheduler)
74+
- Check storage-related problems (use control_plane_logs with csi-azuredisk-controller, csi-azurefile-controller)
75+
- Analyze cluster scaling behavior (use control_plane_logs with cluster-autoscaler)
76+
- Review security audit events (use control_plane_logs with kube-audit, kube-audit-admin)
3177
3278
Examples:
33-
- List metrics: operation="metrics", query_type="list", parameters="{\"resource\":\"<aks-cluster-id>\", \"metrics\": \"node_cpu_usage_percentage\", \"aggregation\": \"Average\"}"
34-
- List metrics definitions: operation="metrics", query_type="list-definitions", parameters="{\"resource\":\"<aks-cluster-id>\"}"
35-
- List metrics namespaces: operation="metrics", query_type="list-namespaces", parameters="{\"resource\":\"<aks-cluster-id>\"}"
36-
- Resource health: operation="resource_health", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"start_time\":\"2025-01-01T00:00:00Z\"}"
37-
- App Insights query: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"...\", \"query\":\"...\"}"
38-
- Check diagnostics: operation="diagnostics", parameters="{\"subscription_id\":\"<subscription-id>\", \"resource_group\":\"<resource-group>\", \"cluster_name\":\"<cluster-name>\"}"
39-
- Query AKS control plane logs: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-apiserver\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
40-
- Query AKS control plane logs with filters: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-apiserver\", \"log_level\":\"error\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
79+
80+
metrics:
81+
- Get CPU usage: operation="metrics", query_type="list", parameters="{\"resource\":\"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster\", \"metrics\":\"node_cpu_usage_percentage\", \"aggregation\":\"Average\", \"start-time\":\"<start-time>\", \"end-time\":\"<end-time>\"}"
82+
- List available metrics: operation="metrics", query_type="list-definitions", parameters="{\"resource\":\"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/cluster\"}"
83+
84+
resource_health:
85+
- Check recent cluster health: operation="resource_health", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"start_time\":\"<start-time>\"}"
86+
87+
app_insights:
88+
- Query request telemetry: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"requests | where timestamp > ago(1h) | summarize count() by bin(timestamp, 5m)\"}"
89+
- Analyze exceptions: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"exceptions | where timestamp > ago(24h) | summarize count() by type, bin(timestamp, 1h)\"}"
90+
- Performance with timespan: operation="app_insights", subscription_id="<subscription-id>", resource_group="<resource-group>", parameters="{\"app_insights_name\":\"myapp-insights\", \"query\":\"performanceCounters | where category == 'Processor' | summarize avg(value) by bin(timestamp, 5m)\", \"timespan\":\"PT1H\"}"
91+
92+
diagnostics:
93+
- Verify diagnostic settings: operation="diagnostics", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{}"
94+
95+
control_plane_logs:
96+
- Query API server logs: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-apiserver\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
97+
- Debug authentication issues: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"guard\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"100\"}"
98+
- Analyze audit events: operation="control_plane_logs", subscription_id="<subscription-id>", resource_group="<resource-group>", cluster_name="<cluster-name>", parameters="{\"log_category\":\"kube-audit\", \"log_level\":\"error\", \"start_time\":\"<start-time>\", \"end_time\":\"<end-time>\", \"max_records\":\"50\"}"
4199
`
42100

43101
return mcp.NewTool("az_monitoring",
44102
mcp.WithDescription(description),
45103
mcp.WithString("operation",
46104
mcp.Required(),
47-
mcp.Description("The monitoring operation to perform"),
105+
mcp.Description("The monitoring operation to perform: 'metrics' (CPU/memory/network), 'resource_health' (cluster availability), 'app_insights' (telemetry analysis), 'diagnostics' (logging config), 'control_plane_logs' (Kubernetes logs like kube-apiserver, kube-audit, guard, etc.)"),
48106
),
49107
mcp.WithString("query_type",
50-
mcp.Description("Specific type of query for metrics operations (list, list-definitions, list-namespaces)"),
108+
mcp.Description("For metrics operations only: 'list' (get metric values), 'list-definitions' (available metrics), 'list-namespaces' (metric categories)"),
51109
),
52110
mcp.WithString("parameters",
53111
mcp.Required(),
54-
mcp.Description("JSON string containing operation-specific parameters"),
112+
mcp.Description("JSON string with operation parameters. metrics: resource (required), metrics (required for 'list' query_type), aggregation/start-time/end-time/interval/filter (optional). resource_health: start_time, end_time, status. app_insights: app_insights_name, query, start_time/end_time OR timespan (optional). diagnostics: none required. control_plane_logs: log_category (kube-apiserver/kube-audit/guard/etc), start_time, end_time, max_records, log_level"),
55113
),
56114
mcp.WithString("subscription_id",
57-
mcp.Description("Azure subscription ID (can be included in parameters)"),
115+
mcp.Description("Azure subscription ID (required for resource_health, app_insights, diagnostics, control_plane_logs)"),
58116
),
59117
mcp.WithString("resource_group",
60-
mcp.Description("Resource group name (can be included in parameters)"),
118+
mcp.Description("Resource group name (required for resource_health, app_insights, diagnostics, control_plane_logs)"),
61119
),
62120
mcp.WithString("cluster_name",
63-
mcp.Description("AKS cluster name (can be included in parameters)"),
121+
mcp.Description("AKS cluster name (required for resource_health, diagnostics, control_plane_logs)"),
64122
),
65123
)
66124
}
67-
68-
// ValidateMonitoringOperation checks if the monitoring operation is supported
69-
func ValidateMonitoringOperation(operation string) bool {
70-
supportedOps := []string{
71-
string(OpMetrics), string(OpResourceHealth), string(OpAppInsights),
72-
string(OpDiagnostics), string(OpControlPlaneLogs),
73-
}
74-
return slices.Contains(supportedOps, operation)
75-
}
76-
77-
// GetSupportedMonitoringOperations returns all supported monitoring operations
78-
func GetSupportedMonitoringOperations() []string {
79-
return []string{
80-
string(OpMetrics), string(OpResourceHealth), string(OpAppInsights),
81-
string(OpDiagnostics), string(OpControlPlaneLogs),
82-
}
83-
}
84-
85-
// ValidateMetricsQueryType checks if the metrics query type is supported
86-
func ValidateMetricsQueryType(queryType string) bool {
87-
supportedTypes := []string{"list", "list-definitions", "list-namespaces"}
88-
return slices.Contains(supportedTypes, queryType)
89-
}
90-
91-
// MapMetricsQueryTypeToCommand maps a metrics query type to its corresponding az command
92-
func MapMetricsQueryTypeToCommand(queryType string) (string, error) {
93-
commandMap := map[string]string{
94-
"list": "az monitor metrics list",
95-
"list-definitions": "az monitor metrics list-definitions",
96-
"list-namespaces": "az monitor metrics list-namespaces",
97-
}
98-
99-
cmd, exists := commandMap[queryType]
100-
if !exists {
101-
return "", fmt.Errorf("no command mapping for metrics query type: %s", queryType)
102-
}
103-
104-
return cmd, nil
105-
}

0 commit comments

Comments
 (0)