diff --git a/e2e/validators.go b/e2e/validators.go index a81cc352d5c..d0fae6f3ca0 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1509,30 +1509,17 @@ func ValidateNodeExporter(ctx context.Context, s *Scenario) { ValidateFileExists(ctx, s, skipFile) ValidateFileExists(ctx, s, "/etc/node-exporter.d/web-config.yml") - // Validate that node-exporter is listening on port 19100 - // We verify the port is open using ss/netstat rather than making a full mTLS request, - // since the e2e test environment may not have the correct client certs set up. - // The mTLS configuration is validated by checking that the web-config.yml exists - // and contains the expected TLS settings. - s.T.Logf("Validating node-exporter is listening on port 19100") + // Validate that node-exporter is listening on port 19100 and serving metrics. + // TLS is disabled by default (opt-in via NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter), + // so we validate by making a plain HTTP request to the metrics endpoint. + s.T.Logf("Validating node-exporter is listening on port 19100 and serving metrics") command := []string{ "set -ex", - "NODE_IP=$(hostname -I | awk '{print $1}')", - // Verify node-exporter is listening on port 19100 - "ss -tlnp | grep -q ':19100' || netstat -tlnp | grep -q ':19100'", + // Extract the listen address from ss, replacing wildcard '*' or '0.0.0.0' with localhost. + "LISTEN_ADDR=$(ss -tlnp | grep ':19100' | awk '{print $4}' | head -1 | sed 's/^\\*/127.0.0.1/; s/^0\\.0\\.0\\.0/127.0.0.1/')", + "curl -sf http://${LISTEN_ADDR}/metrics | grep -q 'node_'", } - execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100") - - // Verify the web-config.yml has proper TLS configuration - s.T.Logf("Validating node-exporter TLS configuration") - tlsCommand := []string{ - "set -ex", - // Verify web-config.yml contains TLS settings - "grep -q 'tls_server_config' /etc/node-exporter.d/web-config.yml", - "grep -q 'client_auth_type' /etc/node-exporter.d/web-config.yml", - "grep -q 'client_ca_file' /etc/node-exporter.d/web-config.yml", - } - execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(tlsCommand, "\n"), 0, "node-exporter TLS config should be properly configured") + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100 and serving metrics over HTTP") s.T.Logf("node-exporter validation passed") } diff --git a/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml b/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml index 07ddffea52e..d6b83d0237c 100644 --- a/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml +++ b/parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml @@ -1,5 +1,3 @@ -tls_server_config: - cert_file: "/etc/kubernetes/certs/kubeletserver.crt" - key_file: "/etc/kubernetes/certs/kubeletserver.key" - client_auth_type: "RequireAndVerifyClientCert" - client_ca_file: "/etc/kubernetes/certs/ca.crt" +# TLS is disabled by default. To enable, set NODE_EXPORTER_TLS_ENABLED=true +# in /etc/default/node-exporter. This file will be overwritten by the startup +# script when TLS is enabled and valid serving certificates are available. diff --git a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh index e5a3613e30d..37299a6ca8e 100755 --- a/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh +++ b/parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh @@ -9,85 +9,82 @@ fi TLS_CONFIG_PATH="/etc/node-exporter.d/web-config.yml" TLS_CONFIG_ARG="" -# Ensure TLS config directory exists -mkdir -p "$(dirname "$TLS_CONFIG_PATH")" - -# Check IMDS tag to determine cert rotation setting (same logic as CSE) -# If aks-disable-kubelet-serving-certificate-rotation=true, use static certs -# Otherwise, use rotation cert -ROTATION_DISABLED="false" -IMDS_CACHE_FILE="/opt/azure/containers/imds_instance_metadata_cache.json" - -# Use CSE's cached IMDS response if available, otherwise fetch directly -if [ -f "$IMDS_CACHE_FILE" ]; then - IMDS_RESPONSE=$(cat "$IMDS_CACHE_FILE") -else - IMDS_RESPONSE=$(curl -fsSL -H "Metadata: true" --noproxy "*" --retry 20 --retry-delay 2 --retry-connrefused --connect-timeout 5 --max-time 60 "http://169.254.169.254/metadata/instance?api-version=2021-02-01" 2>/dev/null) -fi - -if [ -z "$IMDS_RESPONSE" ]; then - echo "WARNING: Failed to fetch IMDS metadata, assuming cert rotation is enabled" -fi - -if [ -n "$IMDS_RESPONSE" ]; then - ROTATION_DISABLED=$(echo "$IMDS_RESPONSE" | jq -r '.compute.tagsList | map(select(.name | test("aks-disable-kubelet-serving-certificate-rotation"; "i")))[0].value // "false" | test("true"; "i")' 2>/dev/null || echo "false") -fi - -# Wait for the appropriate cert to exist (max 5 minutes) -# Certs are created by kubelet during its bootstrap process when it connects -# to the API server, so they may not exist immediately at boot time -WAIT_TIMEOUT=300 -WAIT_INTERVAL=5 -WAIT_ELAPSED=0 - -while [ $WAIT_ELAPSED -lt $WAIT_TIMEOUT ]; do - if [ "$ROTATION_DISABLED" = "true" ]; then - # Rotation disabled - wait for static certs - if [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; then +# TLS is disabled by default for backward compatibility: +# - AKS control plane Prometheus scrapes node-exporter via the API server proxy, +# which connects to backends over plain HTTP. Enabling TLS breaks this path. +# - The old node-exporter VM extension also defaulted to no TLS. +# +# To enable TLS, set NODE_EXPORTER_TLS_ENABLED=true in /etc/default/node-exporter. +# Optionally set NODE_EXPORTER_TLS_CLIENT_AUTH to control client cert requirements +# (default: NoClientCert). Valid values: NoClientCert, RequestClientCert, +# RequireAnyClientCert, VerifyClientCertIfGiven, RequireAndVerifyClientCert. +if [ "${NODE_EXPORTER_TLS_ENABLED:-false}" = "true" ]; then + mkdir -p "$(dirname "$TLS_CONFIG_PATH")" + + TLS_CLIENT_AUTH="${NODE_EXPORTER_TLS_CLIENT_AUTH:-NoClientCert}" + + # Validate client auth type against supported values + case "$TLS_CLIENT_AUTH" in + NoClientCert|RequestClientCert|RequireAnyClientCert|VerifyClientCertIfGiven|RequireAndVerifyClientCert) ;; + *) + echo "WARNING: unsupported NODE_EXPORTER_TLS_CLIENT_AUTH='$TLS_CLIENT_AUTH', defaulting to NoClientCert" + TLS_CLIENT_AUTH="NoClientCert" + ;; + esac + + # Wait for kubelet serving certs to exist (max 5 minutes). + # Certs are created by kubelet during bootstrap and may not exist at boot time. + WAIT_TIMEOUT=300 + WAIT_INTERVAL=5 + WAIT_ELAPSED=0 + + while [ $WAIT_ELAPSED -lt $WAIT_TIMEOUT ]; do + if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ] || \ + { [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; }; then break fi + echo "Waiting for kubelet serving certs... (${WAIT_ELAPSED}s/${WAIT_TIMEOUT}s)" + sleep $WAIT_INTERVAL + WAIT_ELAPSED=$((WAIT_ELAPSED + WAIT_INTERVAL)) + done + + # Detect TLS cert paths + # Priority: rotation cert > static certs + CERT_FILE="" + KEY_FILE="" + + if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then + CERT_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" + KEY_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" + echo "Using kubelet serving certificate rotation cert: $CERT_FILE" + elif [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; then + CERT_FILE="/etc/kubernetes/certs/kubeletserver.crt" + KEY_FILE="/etc/kubernetes/certs/kubeletserver.key" + echo "Using static kubelet serving certs: $CERT_FILE, $KEY_FILE" else - # Rotation enabled - wait for rotation cert - if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then - break - fi + echo "WARNING: TLS enabled but no kubelet serving certs found after ${WAIT_TIMEOUT}s. node-exporter will run without TLS." fi - # Also check the other cert type in case IMDS tag detection was wrong - if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ] || \ - { [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; }; then - break - fi - sleep $WAIT_INTERVAL - WAIT_ELAPSED=$((WAIT_ELAPSED + WAIT_INTERVAL)) -done - -# Detect TLS cert paths -# Priority: rotation cert > static certs > skip TLS -CERT_FILE="" -KEY_FILE="" - -if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then - CERT_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" - KEY_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem" - echo "Using kubelet serving certificate rotation cert: $CERT_FILE" -elif [ -f "/etc/kubernetes/certs/kubeletserver.crt" ] && [ -f "/etc/kubernetes/certs/kubeletserver.key" ]; then - CERT_FILE="/etc/kubernetes/certs/kubeletserver.crt" - KEY_FILE="/etc/kubernetes/certs/kubeletserver.key" - echo "Using static kubelet serving certs: $CERT_FILE, $KEY_FILE" -else - echo "WARNING: No kubelet serving certs found after ${WAIT_TIMEOUT}s, node-exporter will run without TLS. Restart the service after certs are available to enable TLS." -fi -# Configure TLS if we found valid cert paths -if [ -n "$CERT_FILE" ] && [ -n "$KEY_FILE" ]; then - cat > "$TLS_CONFIG_PATH" < "$TLS_CONFIG_PATH" < "$TLS_CONFIG_PATH" <