From bc04f0dc0d35f9c0e40a00cbd2bb04e83f44faa0 Mon Sep 17 00:00:00 2001 From: chmill Date: Mon, 23 Mar 2026 23:10:43 +0000 Subject: [PATCH 1/7] fix: adjust node exporter tls to match what was default behavior in extension --- .../etc/node-exporter.d/web-config.yml | 8 +- .../node-exporter/node-exporter-startup.sh | 113 +++++++----------- 2 files changed, 45 insertions(+), 76 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml b/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml index 07ddffea52e..86659cecde4 100644 --- a/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml +++ b/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml @@ -1,5 +1,3 @@ -tls_server_config: - cert_file: "/etc/kubernetes/certs/kubeletserver.crt" - key_file: "/etc/kubernetes/certs/kubeletserver.key" - client_auth_type: "RequireAndVerifyClientCert" - client_ca_file: "/etc/kubernetes/certs/ca.crt" +# TLS is disabled by default. To enable, set NODE_EXPORTER_TLS_ENABLED=true +# in /etc/default/node-exporter. This file will be overwritten by the startup +# script when TLS is enabled. diff --git a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh index e5a3613e30d..c5d901ba1dd 100755 --- a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh +++ b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh @@ -9,85 +9,56 @@ fi TLS_CONFIG_PATH="/etc/node-exporter.d/web-config.yml" TLS_CONFIG_ARG="" -# Ensure TLS config directory exists -mkdir -p "$(dirname "$TLS_CONFIG_PATH")" - -# Check IMDS tag to determine cert rotation setting (same logic as CSE) -# If aks-disable-kubelet-serving-certificate-rotation=true, use static certs -# Otherwise, use rotation cert -ROTATION_DISABLED="false" -IMDS_CACHE_FILE="/opt/azure/containers/imds_instance_metadata_cache.json" - -# Use CSE's cached IMDS response if available, otherwise fetch directly -if [ -f "$IMDS_CACHE_FILE" ]; then - IMDS_RESPONSE=$(cat "$IMDS_CACHE_FILE") -else - IMDS_RESPONSE=$(curl -fsSL -H "Metadata: true" --noproxy "*" --retry 20 --retry-delay 2 --retry-connrefused --connect-timeout 5 --max-time 60 "http://169.254.169.254/metadata/instance?api-version=2021-02-01" 2>/dev/null) -fi - -if [ -z "$IMDS_RESPONSE" ]; then - echo "WARNING: Failed to fetch IMDS metadata, assuming cert rotation is enabled" -fi - -if [ -n "$IMDS_RESPONSE" ]; then - ROTATION_DISABLED=$(echo "$IMDS_RESPONSE" | jq -r '.compute.tagsList | map(select(.name | test("aks-disable-kubelet-serving-certificate-rotation"; "i")))[0].value // "false" | test("true"; "i")' 2>/dev/null || echo "false") -fi - -# Wait for the appropriate cert to exist (max 5 minutes) -# Certs are created by kubelet during its bootstrap process when it connects -# to the API server, so they may not exist immediately at boot time -WAIT_TIMEOUT=300 -WAIT_INTERVAL=5 -WAIT_ELAPSED=0 - -while [ $WAIT_ELAPSED -lt $WAIT_TIMEOUT ]; do - if [ "$ROTATION_DISABLED" = "true" ]; then - # Rotation disabled - wait for static certs - if [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; then - break - fi +# TLS is disabled by default for backward compatibility: +# - AKS control plane Prometheus scrapes node-exporter via the API server proxy, +# which connects to backends over plain HTTP. Enabling TLS breaks this path. +# - The old node-exporter VM extension also defaulted to no TLS. +# +# To enable TLS, set NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter. +# Optionally set NODE_EXPORTER_TLS_CLIENT_AUTH to control client cert requirements +# (default: NoClientCert). Valid values: NoClientCert, RequireAndVerifyClientCert, +# RequireAnyClientCert, VerifyClientCertIfGiven. +if [ "${NODE_EXPORTER_TLS_ENABLED:-false}" = "true" ]; then + mkdir -p "$(dirname "$TLS_CONFIG_PATH")" + + TLS_CLIENT_AUTH="${NODE_EXPORTER_TLS_CLIENT_AUTH:-NoClientCert}" + + # Detect TLS cert paths + # Priority: rotation cert > static certs + CERT_FILE="" + KEY_FILE="" + + if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then + CERT_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" + KEY_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" + echo "Using kubelet serving certificate rotation cert: $CERT_FILE" + elif [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; then + CERT_FILE="/etc/kubernetes/certs/kubeletserver.crt" + KEY_FILE="/etc/kubernetes/certs/kubeletserver.key" + echo "Using static kubelet serving certs: $CERT_FILE, $KEY_FILE" else - # Rotation enabled - wait for rotation cert - if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then - break - fi + echo "WARNING: TLS enabled but no kubelet serving certs found. node-exporter will run without TLS." fi - # Also check the other cert type in case IMDS tag detection was wrong - if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ] || \ - { [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; }; then - break - fi - sleep $WAIT_INTERVAL - WAIT_ELAPSED=$((WAIT_ELAPSED + WAIT_INTERVAL)) -done - -# Detect TLS cert paths -# Priority: rotation cert > static certs > skip TLS -CERT_FILE="" -KEY_FILE="" - -if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then - CERT_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" - KEY_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" - echo "Using kubelet serving certificate rotation cert: $CERT_FILE" -elif [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; then - CERT_FILE="/etc/kubernetes/certs/kubeletserver.crt" - KEY_FILE="/etc/kubernetes/certs/kubeletserver.key" - echo "Using static kubelet serving certs: $CERT_FILE, $KEY_FILE" -else - echo "WARNING: No kubelet serving certs found after ${WAIT_TIMEOUT}s, node-exporter will run without TLS. Restart the service after certs are available to enable TLS." -fi -# Configure TLS if we found valid cert paths -if [ -n "$CERT_FILE" ] && [ -n "$KEY_FILE" ]; then - cat > "$TLS_CONFIG_PATH" < "$TLS_CONFIG_PATH" < "$TLS_CONFIG_PATH" < Date: Mon, 23 Mar 2026 23:21:21 +0000 Subject: [PATCH 2/7] addressing comments. fix wait for certs, client_auth_type, webconfig comment --- .../etc/node-exporter.d/web-config.yml | 2 +- .../node-exporter/node-exporter-startup.sh | 21 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml b/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml index 86659cecde4..d6b83d0237c 100644 --- a/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml +++ b/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml @@ -1,3 +1,3 @@ # TLS is disabled by default. To enable, set NODE_EXPORTER_TLS_ENABLED=true # in /etc/default/node-exporter. This file will be overwritten by the startup -# script when TLS is enabled. +# script when TLS is enabled and valid serving certificates are available. diff --git a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh index c5d901ba1dd..64b4de33c0c 100755 --- a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh +++ b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh @@ -23,6 +23,22 @@ if [ "${NODE_EXPORTER_TLS_ENABLED:-false}" = "true" ]; then TLS_CLIENT_AUTH="${NODE_EXPORTER_TLS_CLIENT_AUTH:-NoClientCert}" + # Wait for kubelet serving certs to exist (max 5 minutes). + # Certs are created by kubelet during bootstrap and may not exist at boot time. + WAIT_TIMEOUT=300 + WAIT_INTERVAL=5 + WAIT_ELAPSED=0 + + while [ $WAIT_ELAPSED -lt $WAIT_TIMEOUT ]; do + if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ] || \ + { [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; }; then + break + fi + echo "Waiting for kubelet serving certs... (${WAIT_ELAPSED}s/${WAIT_TIMEOUT}s)" + sleep $WAIT_INTERVAL + WAIT_ELAPSED=$((WAIT_ELAPSED + WAIT_INTERVAL)) + done + # Detect TLS cert paths # Priority: rotation cert > static certs CERT_FILE="" @@ -37,7 +53,7 @@ if [ "${NODE_EXPORTER_TLS_ENABLED:-false}" = "true" ]; then KEY_FILE="/etc/kubernetes/certs/kubeletserver.key" echo "Using static kubelet serving certs: $CERT_FILE, $KEY_FILE" else - echo "WARNING: TLS enabled but no kubelet serving certs found. node-exporter will run without TLS." + echo "WARNING: TLS enabled but no kubelet serving certs found after ${WAIT_TIMEOUT}s. node-exporter will run without TLS." fi if [ -n "$CERT_FILE" ] && [ -n "$KEY_FILE" ]; then @@ -54,9 +70,10 @@ EOF tls_server_config: cert_file: "$CERT_FILE" key_file: "$KEY_FILE" + client_auth_type: "NoClientCert" EOF fi - echo "TLS configured: client_auth_type=$TLS_CLIENT_AUTH" + echo "TLS configured with client_auth_type=$TLS_CLIENT_AUTH, cert=$CERT_FILE" TLS_CONFIG_ARG="--web.config.file=${TLS_CONFIG_PATH}" fi fi From f22baaba3e8c52281f7e2405f7c81d4b93017f83 Mon Sep 17 00:00:00 2001 From: chmill Date: Tue, 24 Mar 2026 00:54:06 +0000 Subject: [PATCH 3/7] need to update validation as well i guess --- e2e/validators.go | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index a81cc352d5c..503c865a2f6 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1509,30 +1509,18 @@ func ValidateNodeExporter(ctx context.Context, s *Scenario) { ValidateFileExists(ctx, s, skipFile) ValidateFileExists(ctx, s, "/etc/node-exporter.d/web-config.yml") - // Validate that node-exporter is listening on port 19100 - // We verify the port is open using ss/netstat rather than making a full mTLS request, - // since the e2e test environment may not have the correct client certs set up. - // The mTLS configuration is validated by checking that the web-config.yml exists - // and contains the expected TLS settings. - s.T.Logf("Validating node-exporter is listening on port 19100") + // Validate that node-exporter is listening on port 19100 and serving metrics. + // TLS is disabled by default (opt-in via NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter), + // so we validate by making a plain HTTP request to the metrics endpoint. + s.T.Logf("Validating node-exporter is listening on port 19100 and serving metrics") command := []string{ "set -ex", - "NODE_IP=$(hostname -I | awk '{print $1}')", // Verify node-exporter is listening on port 19100 "ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'", + // Verify node-exporter responds to HTTP requests and returns Prometheus metrics + "curl -sf http://localhost:19100/metrics | grep -q 'node_'", } - execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100") - - // Verify the web-config.yml has proper TLS configuration - s.T.Logf("Validating node-exporter TLS configuration") - tlsCommand := []string{ - "set -ex", - // Verify web-config.yml contains TLS settings - "grep -q 'tls_server_config' /etc/node-exporter.d/web-config.yml", - "grep -q 'client_auth_type' /etc/node-exporter.d/web-config.yml", - "grep -q 'client_ca_file' /etc/node-exporter.d/web-config.yml", - } - execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(tlsCommand, "\n"), 0, "node-exporter TLS config should be properly configured") + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100 and serving metrics over HTTP") s.T.Logf("node-exporter validation passed") } From 25ca5ead0c8fb056a30dbcb84b4e4622ad32b67a Mon Sep 17 00:00:00 2001 From: chmill Date: Tue, 24 Mar 2026 01:47:10 +0000 Subject: [PATCH 4/7] test looking for ip no localhost. handle tlsclient better changing to allowlist and defaulting back when failing --- e2e/validators.go | 6 ++++-- .../node-exporter/node-exporter-startup.sh | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 503c865a2f6..9c81052fdda 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1512,13 +1512,15 @@ func ValidateNodeExporter(ctx context.Context, s *Scenario) { // Validate that node-exporter is listening on port 19100 and serving metrics. // TLS is disabled by default (opt-in via NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter), // so we validate by making a plain HTTP request to the metrics endpoint. + // Note: node-exporter binds to the node's IP (not 0.0.0.0), so we must use the node IP, not localhost. s.T.Logf("Validating node-exporter is listening on port 19100 and serving metrics") command := []string{ "set -ex", // Verify node-exporter is listening on port 19100 "ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'", - // Verify node-exporter responds to HTTP requests and returns Prometheus metrics - "curl -sf http://localhost:19100/metrics | grep -q 'node_'", + // Resolve the node IP the same way the startup script does, then curl the metrics endpoint + "NODE_IP=$(hostname -I | awk '{print $1}')", + "curl -sf http://${NODE_IP}:19100/metrics | grep -q 'node_'", } execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100 and serving metrics over HTTP") diff --git a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh index 64b4de33c0c..37299a6ca8e 100755 --- a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh +++ b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh @@ -16,13 +16,22 @@ TLS_CONFIG_ARG="" # # To enable TLS, set NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter. # Optionally set NODE_EXPORTER_TLS_CLIENT_AUTH to control client cert requirements -# (default: NoClientCert). Valid values: NoClientCert, RequireAndVerifyClientCert, -# RequireAnyClientCert, VerifyClientCertIfGiven. +# (default: NoClientCert). Valid values: NoClientCert, RequestClientCert, +# RequireAnyClientCert, VerifyClientCertIfGiven, RequireAndVerifyClientCert. if [ "${NODE_EXPORTER_TLS_ENABLED:-false}" = "true" ]; then mkdir -p "$(dirname "$TLS_CONFIG_PATH")" TLS_CLIENT_AUTH="${NODE_EXPORTER_TLS_CLIENT_AUTH:-NoClientCert}" + # Validate client auth type against supported values + case "$TLS_CLIENT_AUTH" in + NoClientCert|RequestClientCert|RequireAnyClientCert|VerifyClientCertIfGiven|RequireAndVerifyClientCert) ;; + *) + echo "WARNING: unsupported NODE_EXPORTER_TLS_CLIENT_AUTH='$TLS_CLIENT_AUTH', defaulting to NoClientCert" + TLS_CLIENT_AUTH="NoClientCert" + ;; + esac + # Wait for kubelet serving certs to exist (max 5 minutes). # Certs are created by kubelet during bootstrap and may not exist at boot time. WAIT_TIMEOUT=300 @@ -57,7 +66,7 @@ if [ "${NODE_EXPORTER_TLS_ENABLED:-false}" = "true" ]; then fi if [ -n "$CERT_FILE" ] && [ -n "$KEY_FILE" ]; then - if [ "$TLS_CLIENT_AUTH" = "RequireAndVerifyClientCert" ] || [ "$TLS_CLIENT_AUTH" = "RequireAnyClientCert" ] || [ "$TLS_CLIENT_AUTH" = "VerifyClientCertIfGiven" ]; then + if [ "$TLS_CLIENT_AUTH" != "NoClientCert" ]; then cat > "$TLS_CONFIG_PATH" < Date: Tue, 24 Mar 2026 03:31:03 +0000 Subject: [PATCH 5/7] apparently hostname -I is illegal in some places --- e2e/validators.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 9c81052fdda..af66b259718 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1512,14 +1512,13 @@ func ValidateNodeExporter(ctx context.Context, s *Scenario) { // Validate that node-exporter is listening on port 19100 and serving metrics. // TLS is disabled by default (opt-in via NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter), // so we validate by making a plain HTTP request to the metrics endpoint. - // Note: node-exporter binds to the node's IP (not 0.0.0.0), so we must use the node IP, not localhost. s.T.Logf("Validating node-exporter is listening on port 19100 and serving metrics") command := []string{ "set -ex", // Verify node-exporter is listening on port 19100 "ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'", - // Resolve the node IP the same way the startup script does, then curl the metrics endpoint - "NODE_IP=$(hostname -I | awk '{print $1}')", + // Resolve the node IP this shoudl work regardless of distro... then validate we can get metrics from node-exporter + "NODE_IP=$(ip -o -4 addr show dev eth0 | awk '{print $4}' | cut -d '/' -f 1)", "curl -sf http://${NODE_IP}:19100/metrics | grep -q 'node_'", } execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100 and serving metrics over HTTP") From 4cd6c51b52e39074e62aa3b3a2db7a8774c21bde Mon Sep 17 00:00:00 2001 From: chmill Date: Tue, 24 Mar 2026 05:03:02 +0000 Subject: [PATCH 6/7] going with this because validator env is silly and this should work --- e2e/validators.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index af66b259718..530d138a674 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1515,11 +1515,9 @@ func ValidateNodeExporter(ctx context.Context, s *Scenario) { s.T.Logf("Validating node-exporter is listening on port 19100 and serving metrics") command := []string{ "set -ex", - // Verify node-exporter is listening on port 19100 - "ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'", - // Resolve the node IP this shoudl work regardless of distro... then validate we can get metrics from node-exporter - "NODE_IP=$(ip -o -4 addr show dev eth0 | awk '{print $4}' | cut -d '/' -f 1)", - "curl -sf http://${NODE_IP}:19100/metrics | grep -q 'node_'", + // Extract the listen address directly from ss — no hostname resolution needed. + "LISTEN_ADDR=$(ss -tlnp | grep ':19100' | awk '{print $4}' | head -1)", + "curl -sf http://${LISTEN_ADDR}/metrics | grep -q 'node_'", } execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100 and serving metrics over HTTP") From 7993d35aa6e4b0e68d57b6d803995283f50132e5 Mon Sep 17 00:00:00 2001 From: chmill Date: Tue, 24 Mar 2026 06:23:29 +0000 Subject: [PATCH 7/7] tests are the hard part --- e2e/validators.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 530d138a674..d0fae6f3ca0 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1515,8 +1515,8 @@ func ValidateNodeExporter(ctx context.Context, s *Scenario) { s.T.Logf("Validating node-exporter is listening on port 19100 and serving metrics") command := []string{ "set -ex", - // Extract the listen address directly from ss — no hostname resolution needed. - "LISTEN_ADDR=$(ss -tlnp | grep ':19100' | awk '{print $4}' | head -1)", + // Extract the listen address from ss, replacing wildcard '*' or '0.0.0.0' with localhost. + "LISTEN_ADDR=$(ss -tlnp | grep ':19100' | awk '{print $4}' | head -1 | sed 's/^\\*/127.0.0.1/; s/^0\\.0\\.0\\.0/127.0.0.1/')", "curl -sf http://${LISTEN_ADDR}/metrics | grep -q 'node_'", } execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100 and serving metrics over HTTP")