Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions e2e/config/vhd.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,15 @@ var (
OSDiskSizeGB: 60,
}

VHDAzureLinuxV3Gen2Arm64 = &Image{
Name: "azurelinuxv3gen2arm64",
OS: OSAzureLinux,
Arch: "arm64",
Distro: datamodel.AKSAzureLinuxV3Arm64Gen2,
Gallery: imageGalleryLinux,
OSDiskSizeGB: 60,
}

VHDACLGen2TL = &Image{
Name: "aclgen2TL",
OS: OSACL,
Expand Down
3 changes: 3 additions & 0 deletions e2e/scenario_gpu_managed_experience_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ func Test_DCGM_Exporter_Compatibility(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
RunScenario(t, &Scenario{
Description: tc.description,
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: tc.vhd,
Expand Down
19 changes: 19 additions & 0 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,25 @@ func Test_Flatcar_ARM64(t *testing.T) {
})
}

func Test_AzureLinuxV3_ARM64(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that a node using a AzureLinuxV3 VHD on ARM64 architecture can be properly bootstrapped",
Comment thread
awesomenix marked this conversation as resolved.
Comment thread
awesomenix marked this conversation as resolved.
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDAzureLinuxV3Gen2Arm64,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5"
nbc.IsARM64 = true
},
Validator: func(ctx context.Context, s *Scenario) {
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_D2pds_V5")
},
},
})
}

func Test_Flatcar_AzureCNI(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Flatcar scenario on a cluster configured with Azure CNI",
Expand Down
30 changes: 18 additions & 12 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -164,20 +164,16 @@ configureCustomCaCertificate() {
done
# blocks until svc is considered active, which will happen when ExecStart command terminates with code 0
systemctl restart update_certs.service || exit $ERR_UPDATE_CA_CERTS
# containerd has to be restarted after new certs are added to the trust store, otherwise they will not be used until restart happens
systemctl restart containerd
}

configureContainerdUlimits() {
CONTAINERD_ULIMIT_DROP_IN_FILE_PATH="/etc/systemd/system/containerd.service.d/set_ulimits.conf"
mkdir -p "$(dirname "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}")"
touch "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}"
chmod 0600 "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}"
tee "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" > /dev/null <<EOF
$(echo "$CONTAINERD_ULIMITS" | tr ' ' '\n')
EOF
Comment thread
awesomenix marked this conversation as resolved.

systemctl daemon-reload
systemctl restart containerd
}

# file paths defined outside so configureAzureJson can be unit tested
Expand Down Expand Up @@ -301,9 +297,10 @@ EOF
}

configureCNI() {
# needed for the iptables rules to work on bridges
retrycmd_if_failure 120 5 25 modprobe br_netfilter || exit $ERR_MODPROBE_FAIL
# needed for bridge iptables rules and customer-configured conntrack sysctls
retrycmd_if_failure 120 5 25 modprobe -a br_netfilter nf_conntrack || exit $ERR_MODPROBE_FAIL
echo -n "br_netfilter" > /etc/modules-load.d/br_netfilter.conf
echo -n "nf_conntrack" > /etc/modules-load.d/nf_conntrack.conf
configureCNIIPTables
}

Expand Down Expand Up @@ -381,7 +378,7 @@ net.ipv6.conf.all.forwarding = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
retrycmd_if_failure 120 5 25 sysctl --system || exit $ERR_SYSCTL_RELOAD
systemctlEnableAndStart containerd 30 || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStartNoBlock containerd 30 || exit $ERR_SYSTEMCTL_START_FAIL
Comment thread
awesomenix marked this conversation as resolved.
}

configureContainerdRegistryHost() {
Expand Down Expand Up @@ -424,6 +421,7 @@ ensureNoDupOnPromiscuBridge() {
}

ensureArtifactStreaming() {
waitForContainerdReady || exit $ERR_ARTIFACT_STREAMING_INSTALL
retrycmd_if_failure 120 5 25 time systemctl --quiet enable --now acr-mirror overlaybd-tcmu overlaybd-snapshotter
time /opt/acr/bin/acr-config --enable-containerd 'azurecr.io'
}
Expand Down Expand Up @@ -566,6 +564,8 @@ ensurePodInfraContainerImage() {
POD_INFRA_CONTAINER_IMAGE_DOWNLOAD_DIR="/opt/pod-infra-container-image/downloads"
POD_INFRA_CONTAINER_IMAGE_TAR="/opt/pod-infra-container-image/pod-infra-container-image.tar"

waitForContainerdReady || exit $ERR_PULL_POD_INFRA_CONTAINER_IMAGE

pod_infra_container_image=$(get_sandbox_image)

if [ -z "${pod_infra_container_image}" ]; then
Expand Down Expand Up @@ -776,17 +776,22 @@ EOF
logs_to_events "AKS.CSE.ensureKubelet.ensurePodInfraContainerImage" ensurePodInfraContainerImage
fi

# start measure-tls-bootstrapping-latency.service without waiting for the main process to start, while ignoring any failures
if ! systemctlEnableAndStartNoBlock measure-tls-bootstrapping-latency 30; then

@cameronmeissner cameronmeissner Mar 17, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this really adding a lot of latency? systemctlEnableAndStartNoBlock explicitly does not block, meaning systemd doesn't wait for the unit to enter a running state before returning

mainly asking since moving this below ensureKubelet seems to add a fair bit of complexity

echo "failed to start measure-tls-bootstrapping-latency.service"
fi
local tls_bootstrapping_start_time_filepath="/opt/azure/containers/tls-bootstrap-start-time"
date +"%F %T.%3N" > "${tls_bootstrapping_start_time_filepath}"

# start kubelet.service without waiting for the main process to start, though check whether it has entered a failed state after enablement
if ! systemctlEnableAndStartNoBlock kubelet 240; then
# append kubelet status to CSE output to ensure we can see it
rm -f "${tls_bootstrapping_start_time_filepath}"
journalctl -u kubelet.service --no-pager || true
exit $ERR_KUBELET_START_FAIL
fi

# start measure-tls-bootstrapping-latency.service without waiting for the main process to start, while ignoring any failures
if ! systemctlEnableAndStartNoBlock measure-tls-bootstrapping-latency 30; then
rm -f "${tls_bootstrapping_start_time_filepath}"
echo "failed to start measure-tls-bootstrapping-latency.service"
fi
Comment thread
awesomenix marked this conversation as resolved.
}

ensureSnapshotUpdate() {
Expand Down Expand Up @@ -923,6 +928,7 @@ configAzurePolicyAddon() {

configGPUDrivers() {
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
mkdir -p /opt/{actions,gpu}
ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
Expand Down
38 changes: 27 additions & 11 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -507,19 +507,20 @@ systemctlEnableAndStart() {
service=$1; timeout=$2
systemctl_restart 100 5 $timeout $service
RESTART_STATUS=$?
systemctl status $service --no-pager -l > /var/log/azure/$service-status.log
if [ $RESTART_STATUS -ne 0 ]; then
echo "$service could not be started"
systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true
return 1
fi
if ! retrycmd_if_failure 120 5 25 systemctl enable $service; then
echo "$service could not be enabled by systemctl"
systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true
return 1
fi
}

systemctlEnableAndStartNoBlock() {
service=$1; timeout=$2; status_check_delay_seconds=${3:-"0"}
service=$1; timeout=$2

systemctl_restart_no_block 100 5 $timeout $service
RESTART_STATUS=$?
Expand All @@ -534,21 +535,36 @@ systemctlEnableAndStartNoBlock() {
systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true
return 1
fi
}
Comment on lines 522 to +538

Copilot AI Mar 23, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

systemctlEnableAndStartNoBlock uses systemctl restart --no-block, which only enqueues the job and can return success even if the unit immediately transitions to failed afterward. ensureKubelet relies on this returning non-zero to detect startup failures, so this can mask real failures (kubelet/measure-tls/containerd). Consider adding a short post-start health check (e.g., systemctl is-failed/ActiveState after a delay) or reintroducing an optional delay parameter so callers can fail fast when the unit enters failed.

Copilot uses AI. Check for mistakes.

checkServiceHealth() {
local service=$1
local state=$(systemctl show -p ActiveState --value "$service")

if [ "$state" = "active" ]; then
return 0
fi

# wait for the specified delay seconds before checking the service status to make sure
# it hasn't gone into a failed state
sleep $status_check_delay_seconds
systemctl status "$service" --no-pager -l > "/var/log/azure/$service-status.log" || true

if systemctl is-failed $service; then
if [ "$state" = "failed" ]; then
echo "$service is in a failed state"
systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true
return 1
elif [ "$state" = "activating" ]; then
echo "$service is still activating, continuing anyway..."
fi
}
Comment thread
Devinwong marked this conversation as resolved.

# systemctl status only exits with code 0 iff the service is "active",
# thus we handle the "activating" case by checking for a non-zero exit code
if ! systemctl status $service --no-pager -l > /var/log/azure/$service-status.log; then
echo "$service is still activating, continuing anyway..."
waitForContainerdReady() {
local ret=0

echo "Waiting for containerd to become ready..."
retrycmd_if_failure 60 0.1 1 bash -c 'ctr version >/dev/null 2>&1'
ret=$?
if [ "$ret" -ne 0 ]; then
echo "containerd did not become ready"
systemctl status containerd --no-pager -l > /var/log/azure/containerd-status.log || true
return 1
fi
}

Expand Down
11 changes: 8 additions & 3 deletions parts/linux/cloud-init/artifacts/cse_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -618,8 +618,12 @@ installKubeletKubectlFromURL() {
fi
fi
fi
install -m0755 "/opt/bin/kubelet-${KUBERNETES_VERSION}" /opt/bin/kubelet
install -m0755 "/opt/bin/kubectl-${KUBERNETES_VERSION}" /opt/bin/kubectl
Comment thread
cameronmeissner marked this conversation as resolved.

mv "/opt/bin/kubelet-${KUBERNETES_VERSION}" /opt/bin/kubelet
mv "/opt/bin/kubectl-${KUBERNETES_VERSION}" /opt/bin/kubectl

chown root:root /opt/bin/kubelet /opt/bin/kubectl
chmod 0755 /opt/bin/kubelet /opt/bin/kubectl

rm -rf /opt/bin/kubelet-* /opt/bin/kubectl-* /home/hyperkube-downloads &
}
Expand Down Expand Up @@ -686,8 +690,9 @@ labelContainerImage() {
}

retagMCRImagesForChina() {
waitForContainerdReady || exit $ERR_CTR_OPERATION_ERROR
# shellcheck disable=SC2016
allMCRImages=($(ctr --namespace k8s.io images list | grep '^mcr.microsoft.com/' | awk '{print $1}'))
allMCRImages=($(ctr --namespace k8s.io images list | grep '^mcr.microsoft.com/' | awk '{print $1}'))
if [ -z "${allMCRImages}" ]; then
Comment thread
awesomenix marked this conversation as resolved.
echo "failed to find mcr images for retag"
return
Expand Down
45 changes: 30 additions & 15 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,13 @@ function basePrep {
echo "Golden image; skipping dependencies installation"
fi

# Container runtime already installed on Azure Linux OS Guard
if ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
# Container runtime already installed on Azure Linux OS Guard; an explicit containerd override can bypass FULL_INSTALL_REQUIRED for other Linux distros
if isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
echo "Skipping installContainerRuntime because containerd is already available"
elif [ "$FULL_INSTALL_REQUIRED" = "true" ] || [ -n "${CONTAINERD_PACKAGE_URL}" ]; then
logs_to_events "AKS.CSE.installContainerRuntime" installContainerRuntime
else
echo "Skipping installContainerRuntime because containerd is already available"
fi
setupCNIDirs

Expand All @@ -170,6 +174,9 @@ function basePrep {
SHOULD_ENFORCE_KUBE_PMC_INSTALL=$(should_enforce_kube_pmc_install)
logs_to_events "AKS.CSE.configureKubeletAndKubectl" configureKubeletAndKubectl

# pre-warm kubelet by checking its version.
nohup /bin/sh -c '/opt/bin/kubelet --version >/dev/null 2>&1' >/dev/null 2>&1 &

createKubeManifestDir

if [ "${HAS_CUSTOM_SEARCH_DOMAIN}" = "true" ]; then
Expand All @@ -194,6 +201,10 @@ function basePrep {
logs_to_events "AKS.CSE.configureSystemdUseDomains" configureSystemdUseDomains
fi

if [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" = "true" ]; then
logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits
fi

# containerd should not be configured until cni has been configured first
logs_to_events "AKS.CSE.ensureContainerd" ensureContainerd

Expand Down Expand Up @@ -268,14 +279,6 @@ EOF

logs_to_events "AKS.CSE.ensureSysctl" ensureSysctl || exit $ERR_SYSCTL_RELOAD

if [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" = "true" ]; then
logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits
fi

if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" = "true" ]; then
logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge
fi

if ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
if [ "$OS" = "$UBUNTU_OS_NAME" ] || isMarinerOrAzureLinux "$OS"; then
logs_to_events "AKS.CSE.ubuntuSnapshotUpdate" ensureSnapshotUpdate
Expand Down Expand Up @@ -351,11 +354,6 @@ function nodePrep {
# By default, never reboot new nodes.
REBOOTREQUIRED=false

# Clean up GPU drivers if not a GPU node or if skipping driver install
if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then
logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers
fi

# Install and configure GPU drivers if this is a GPU node
if [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then
echo $(date),$(hostname), "Start configuring GPU drivers"
Expand Down Expand Up @@ -484,10 +482,27 @@ function nodePrep {
exit $VALIDATION_ERR
fi

checkServiceHealth containerd || exit $ERR_SYSTEMCTL_START_FAIL

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: it would be better if we had specific exit codes here, that way we can always pinpoint exactly which call to checkServiceHealth is failing

if [ "${ENABLE_SECURE_TLS_BOOTSTRAPPING}" = "true" ]; then
checkServiceHealth secure-tls-bootstrap || exit $ERR_SYSTEMCTL_START_FAIL
fi

logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we teach AI in the PR that it should report when changes are introduced before ensure Kubelet which will have direct impact on nodebootstrapping time ?


if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" = "true" ]; then
logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

weird that the service file has

[Unit]
Description=Add dedup ebtable rules for kubenet bridge in promiscuous mode
After=containerd.service
After=kubelet.service
[Service]

systemctlEnableAndStart ensure-no-dup 30 would run before, and simply wanted to auto start in case either containderd or kubelet starts.

in this case, I'm not sure if calling the enable/start will have an effect, not that we actually start containd/kubelet before ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

fi
Comment thread
awesomenix marked this conversation as resolved.

logs_to_events "AKS.CSE.configureNodeExporter" configureNodeExporter


# Clean up GPU drivers if not a GPU node or if skipping driver install
if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then
logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers
fi

checkServiceHealth kubelet || exit $ERR_KUBELET_FAIL

if $REBOOTREQUIRED; then
echo 'reboot required, rebooting node in 1 minute'
/bin/bash -c "shutdown -r 1 &"
Expand Down
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/kubelet.service
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ ExecStartPre=-/sbin/ebtables -t nat --list
ExecStartPre=-/sbin/iptables -t nat --numeric --list

ExecStartPre=/bin/bash /opt/azure/containers/validate-kubelet-credentials.sh
ExecStartPre=/bin/sh -c 'until [ -S /run/containerd/containerd.sock ]; do sleep 0.1; done'
Comment thread
awesomenix marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this to avoid kubelet going into a bad exponential back-off or something?


ExecStart=/opt/bin/kubelet \
--enable-server \
Expand Down
Loading
Loading