diff --git a/e2e/config/vhd.go b/e2e/config/vhd.go index f3bf2f573da..75351fe97cd 100644 --- a/e2e/config/vhd.go +++ b/e2e/config/vhd.go @@ -218,6 +218,15 @@ var ( OSDiskSizeGB: 60, } + VHDAzureLinuxV3Gen2Arm64 = &Image{ + Name: "azurelinuxv3gen2arm64", + OS: OSAzureLinux, + Arch: "arm64", + Distro: datamodel.AKSAzureLinuxV3Arm64Gen2, + Gallery: imageGalleryLinux, + OSDiskSizeGB: 60, + } + VHDACLGen2TL = &Image{ Name: "aclgen2TL", OS: OSACL, diff --git a/e2e/scenario_gpu_managed_experience_test.go b/e2e/scenario_gpu_managed_experience_test.go index 80cefed3e51..6971e7cfabd 100644 --- a/e2e/scenario_gpu_managed_experience_test.go +++ b/e2e/scenario_gpu_managed_experience_test.go @@ -190,6 +190,9 @@ func Test_DCGM_Exporter_Compatibility(t *testing.T) { t.Run(tc.name, func(t *testing.T) { RunScenario(t, &Scenario{ Description: tc.description, + Tags: Tags{ + GPU: true, + }, Config: Config{ Cluster: ClusterKubenet, VHD: tc.vhd, diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index a71a9c3e08e..a6d975f3781 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -106,6 +106,25 @@ func Test_Flatcar_ARM64(t *testing.T) { }) } +func Test_AzureLinuxV3_ARM64(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that a node using a AzureLinuxV3 VHD on ARM64 architecture can be properly bootstrapped", + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDAzureLinuxV3Gen2Arm64, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" + nbc.IsARM64 = true + }, + Validator: func(ctx context.Context, s *Scenario) { + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_D2pds_V5") + }, + }, + }) +} + func Test_Flatcar_AzureCNI(t *testing.T) { RunScenario(t, &Scenario{ Description: "Flatcar scenario on a cluster configured with Azure CNI", diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh 
b/parts/linux/cloud-init/artifacts/cse_config.sh index ca6629b5b40..334dba84cbf 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -164,20 +164,16 @@ configureCustomCaCertificate() { done # blocks until svc is considered active, which will happen when ExecStart command terminates with code 0 systemctl restart update_certs.service || exit $ERR_UPDATE_CA_CERTS - # containerd has to be restarted after new certs are added to the trust store, otherwise they will not be used until restart happens - systemctl restart containerd } configureContainerdUlimits() { CONTAINERD_ULIMIT_DROP_IN_FILE_PATH="/etc/systemd/system/containerd.service.d/set_ulimits.conf" + mkdir -p "$(dirname "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}")" touch "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" chmod 0600 "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" tee "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" > /dev/null < /etc/modules-load.d/br_netfilter.conf + echo -n "nf_conntrack" > /etc/modules-load.d/nf_conntrack.conf configureCNIIPTables } @@ -381,7 +378,7 @@ net.ipv6.conf.all.forwarding = 1 net.bridge.bridge-nf-call-iptables = 1 EOF retrycmd_if_failure 120 5 25 sysctl --system || exit $ERR_SYSCTL_RELOAD - systemctlEnableAndStart containerd 30 || exit $ERR_SYSTEMCTL_START_FAIL + systemctlEnableAndStartNoBlock containerd 30 || exit $ERR_SYSTEMCTL_START_FAIL } configureContainerdRegistryHost() { @@ -424,6 +421,7 @@ ensureNoDupOnPromiscuBridge() { } ensureArtifactStreaming() { + waitForContainerdReady || exit $ERR_ARTIFACT_STREAMING_INSTALL retrycmd_if_failure 120 5 25 time systemctl --quiet enable --now acr-mirror overlaybd-tcmu overlaybd-snapshotter time /opt/acr/bin/acr-config --enable-containerd 'azurecr.io' } @@ -566,6 +564,8 @@ ensurePodInfraContainerImage() { POD_INFRA_CONTAINER_IMAGE_DOWNLOAD_DIR="/opt/pod-infra-container-image/downloads" POD_INFRA_CONTAINER_IMAGE_TAR="/opt/pod-infra-container-image/pod-infra-container-image.tar" + 
waitForContainerdReady || exit $ERR_PULL_POD_INFRA_CONTAINER_IMAGE + pod_infra_container_image=$(get_sandbox_image) if [ -z "${pod_infra_container_image}" ]; then @@ -776,17 +776,22 @@ EOF logs_to_events "AKS.CSE.ensureKubelet.ensurePodInfraContainerImage" ensurePodInfraContainerImage fi - # start measure-tls-bootstrapping-latency.service without waiting for the main process to start, while ignoring any failures - if ! systemctlEnableAndStartNoBlock measure-tls-bootstrapping-latency 30; then - echo "failed to start measure-tls-bootstrapping-latency.service" - fi + local tls_bootstrapping_start_time_filepath="/opt/azure/containers/tls-bootstrap-start-time" + date +"%F %T.%3N" > "${tls_bootstrapping_start_time_filepath}" # start kubelet.service without waiting for the main process to start, though check whether it has entered a failed state after enablement if ! systemctlEnableAndStartNoBlock kubelet 240; then # append kubelet status to CSE output to ensure we can see it + rm -f "${tls_bootstrapping_start_time_filepath}" journalctl -u kubelet.service --no-pager || true exit $ERR_KUBELET_START_FAIL fi + + # start measure-tls-bootstrapping-latency.service without waiting for the main process to start, while ignoring any failures + if ! 
systemctlEnableAndStartNoBlock measure-tls-bootstrapping-latency 30; then + rm -f "${tls_bootstrapping_start_time_filepath}" + echo "failed to start measure-tls-bootstrapping-latency.service" + fi } ensureSnapshotUpdate() { @@ -923,6 +928,7 @@ configAzurePolicyAddon() { configGPUDrivers() { if [ "$OS" = "$UBUNTU_OS_NAME" ]; then + waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL mkdir -p /opt/{actions,gpu} ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index fe50af11d41..a6f2f6390d9 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -507,19 +507,20 @@ systemctlEnableAndStart() { service=$1; timeout=$2 systemctl_restart 100 5 $timeout $service RESTART_STATUS=$? - systemctl status $service --no-pager -l > /var/log/azure/$service-status.log if [ $RESTART_STATUS -ne 0 ]; then echo "$service could not be started" + systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true return 1 fi if ! retrycmd_if_failure 120 5 25 systemctl enable $service; then echo "$service could not be enabled by systemctl" + systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true return 1 fi } systemctlEnableAndStartNoBlock() { - service=$1; timeout=$2; status_check_delay_seconds=${3:-"0"} + service=$1; timeout=$2 systemctl_restart_no_block 100 5 $timeout $service RESTART_STATUS=$? 
@@ -534,21 +535,36 @@ systemctlEnableAndStartNoBlock() { systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true return 1 fi +} + +checkServiceHealth() { + local service=$1 + local state=$(systemctl show -p ActiveState --value "$service") + + if [ "$state" = "active" ]; then + return 0 + fi - # wait for the specified delay seconds before checking the service status to make sure - # it hasn't gone into a failed state - sleep $status_check_delay_seconds + systemctl status "$service" --no-pager -l > "/var/log/azure/$service-status.log" || true - if systemctl is-failed $service; then + if [ "$state" = "failed" ]; then echo "$service is in a failed state" - systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true return 1 + elif [ "$state" = "activating" ]; then + echo "$service is still activating, continuing anyway..." fi +} - # systemctl status only exits with code 0 iff the service is "active", - # thus we handle the "activating" case by checking for a non-zero exit code - if ! systemctl status $service --no-pager -l > /var/log/azure/$service-status.log; then - echo "$service is still activating, continuing anyway..." +waitForContainerdReady() { + local ret=0 + + echo "Waiting for containerd to become ready..." + retrycmd_if_failure 60 0.1 1 bash -c 'ctr version >/dev/null 2>&1' + ret=$? 
+ if [ "$ret" -ne 0 ]; then + echo "containerd did not become ready" + systemctl status containerd --no-pager -l > /var/log/azure/containerd-status.log || true + return 1 fi } diff --git a/parts/linux/cloud-init/artifacts/cse_install.sh b/parts/linux/cloud-init/artifacts/cse_install.sh index 3f7871292a9..05827671fee 100755 --- a/parts/linux/cloud-init/artifacts/cse_install.sh +++ b/parts/linux/cloud-init/artifacts/cse_install.sh @@ -618,8 +618,12 @@ installKubeletKubectlFromURL() { fi fi fi - install -m0755 "/opt/bin/kubelet-${KUBERNETES_VERSION}" /opt/bin/kubelet - install -m0755 "/opt/bin/kubectl-${KUBERNETES_VERSION}" /opt/bin/kubectl + + mv "/opt/bin/kubelet-${KUBERNETES_VERSION}" /opt/bin/kubelet + mv "/opt/bin/kubectl-${KUBERNETES_VERSION}" /opt/bin/kubectl + + chown root:root /opt/bin/kubelet /opt/bin/kubectl + chmod 0755 /opt/bin/kubelet /opt/bin/kubectl rm -rf /opt/bin/kubelet-* /opt/bin/kubectl-* /home/hyperkube-downloads & } @@ -686,8 +690,9 @@ labelContainerImage() { } retagMCRImagesForChina() { + waitForContainerdReady || exit $ERR_CTR_OPERATION_ERROR # shellcheck disable=SC2016 - allMCRImages=($(ctr --namespace k8s.io images list | grep '^mcr.microsoft.com/' | awk '{print $1}')) + allMCRImages=($(ctr --namespace k8s.io images list | grep '^mcr.microsoft.com/' | awk '{print $1}')) if [ -z "${allMCRImages}" ]; then echo "failed to find mcr images for retag" return diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index 0225bfd0944..57d73a4a7a2 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -152,9 +152,13 @@ function basePrep { echo "Golden image; skipping dependencies installation" fi - # Container runtime already installed on Azure Linux OS Guard - if ! 
isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then + # Container runtime already installed on Azure Linux OS Guard; an explicit containerd override can bypass FULL_INSTALL_REQUIRED for other Linux distros + if isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then + echo "Skipping installContainerRuntime because containerd is already available" + elif [ "$FULL_INSTALL_REQUIRED" = "true" ] || [ -n "${CONTAINERD_PACKAGE_URL}" ]; then logs_to_events "AKS.CSE.installContainerRuntime" installContainerRuntime + else + echo "Skipping installContainerRuntime because containerd is already available" fi setupCNIDirs @@ -170,6 +174,9 @@ function basePrep { SHOULD_ENFORCE_KUBE_PMC_INSTALL=$(should_enforce_kube_pmc_install) logs_to_events "AKS.CSE.configureKubeletAndKubectl" configureKubeletAndKubectl + # pre-warm kubelet by checking its version. + nohup /bin/sh -c '/opt/bin/kubelet --version >/dev/null 2>&1' >/dev/null 2>&1 & + createKubeManifestDir if [ "${HAS_CUSTOM_SEARCH_DOMAIN}" = "true" ]; then @@ -194,6 +201,10 @@ function basePrep { logs_to_events "AKS.CSE.configureSystemdUseDomains" configureSystemdUseDomains fi + if [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" = "true" ]; then + logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits + fi + # containerd should not be configured until cni has been configured first logs_to_events "AKS.CSE.ensureContainerd" ensureContainerd @@ -268,14 +279,6 @@ EOF logs_to_events "AKS.CSE.ensureSysctl" ensureSysctl || exit $ERR_SYSCTL_RELOAD - if [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" = "true" ]; then - logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits - fi - - if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" = "true" ]; then - logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge - fi - if ! 
isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then if [ "$OS" = "$UBUNTU_OS_NAME" ] || isMarinerOrAzureLinux "$OS"; then logs_to_events "AKS.CSE.ubuntuSnapshotUpdate" ensureSnapshotUpdate @@ -351,11 +354,6 @@ function nodePrep { # By default, never reboot new nodes. REBOOTREQUIRED=false - # Clean up GPU drivers if not a GPU node or if skipping driver install - if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then - logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers - fi - # Install and configure GPU drivers if this is a GPU node if [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then echo $(date),$(hostname), "Start configuring GPU drivers" @@ -484,10 +482,27 @@ function nodePrep { exit $VALIDATION_ERR fi + checkServiceHealth containerd || exit $ERR_SYSTEMCTL_START_FAIL + if [ "${ENABLE_SECURE_TLS_BOOTSTRAPPING}" = "true" ]; then + checkServiceHealth secure-tls-bootstrap || exit $ERR_SYSTEMCTL_START_FAIL + fi + logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet + if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" = "true" ]; then + logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge + fi + logs_to_events "AKS.CSE.configureNodeExporter" configureNodeExporter + + # Clean up GPU drivers if not a GPU node or if skipping driver install + if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then + logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers + fi + + checkServiceHealth kubelet || exit $ERR_KUBELET_FAIL + if $REBOOTREQUIRED; then echo 'reboot required, rebooting node in 1 minute' /bin/bash -c "shutdown -r 1 &" diff --git a/parts/linux/cloud-init/artifacts/kubelet.service b/parts/linux/cloud-init/artifacts/kubelet.service index 03662a5a35d..42b48afead7 100644 --- a/parts/linux/cloud-init/artifacts/kubelet.service +++ b/parts/linux/cloud-init/artifacts/kubelet.service @@ -21,6 +21,7 @@ ExecStartPre=-/sbin/ebtables -t nat --list 
ExecStartPre=-/sbin/iptables -t nat --numeric --list ExecStartPre=/bin/bash /opt/azure/containers/validate-kubelet-credentials.sh +ExecStartPre=/bin/sh -c 'until [ -S /run/containerd/containerd.sock ]; do sleep 0.1; done' ExecStart=/opt/bin/kubelet \ --enable-server \ diff --git a/parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.sh b/parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.sh index 8025f180388..c1c51cd1030 100644 --- a/parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.sh +++ b/parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.sh @@ -11,12 +11,14 @@ EVENTS_LOGGING_DIR=/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events KUBECONFIG_PATH="${KUBECONFIG_PATH:-/var/lib/kubelet/kubeconfig}" KUBECONFIG_DIR="$(dirname "$KUBECONFIG_PATH")" +TLS_BOOTSTRAPPING_START_TIME_FILEPATH="${TLS_BOOTSTRAPPING_START_TIME_FILEPATH:-/opt/azure/containers/tls-bootstrap-start-time}" WATCH_TIMEOUT_SECONDS=${WATCH_TIMEOUT_SECONDS:-300} # default to 5 minutes createGuestAgentEvent() { local task=$1; startTime=$2; endTime=$3; - local eventsFileName=$(date +%s%3N) + local eventsFileName + eventsFileName=$(date +%s%3N) json_string=$( jq -n \ --arg Timestamp "${startTime}" \ @@ -29,7 +31,14 @@ createGuestAgentEvent() { --arg EventTid "0" \ '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}' ) - echo ${json_string} > ${EVENTS_LOGGING_DIR}${eventsFileName}.json + echo "${json_string}" > "${EVENTS_LOGGING_DIR}${eventsFileName}.json" +} + +emitTLSBootstrappingCompletedEvent() { + local start_time=$1 + local end_time + end_time=$(date +"%F %T.%3N") + createGuestAgentEvent "AKS.Runtime.waitForTLSBootstrapping" "$start_time" "$end_time" } waitForTLSBootstrapping() { @@ -39,36 +48,43 @@ waitForTLSBootstrapping() { exit 0 fi + if [ ! 
-s "$TLS_BOOTSTRAPPING_START_TIME_FILEPATH" ]; then + echo "TLS bootstrapping start time file not found at: $TLS_BOOTSTRAPPING_START_TIME_FILEPATH" + exit 0 + fi + + START_TIME=$(cat "$TLS_BOOTSTRAPPING_START_TIME_FILEPATH") + trap 'rm -f "$TLS_BOOTSTRAPPING_START_TIME_FILEPATH"' EXIT + # ensure the kubeconfig dir exists mkdir -p "$KUBECONFIG_DIR" - # check if a kubeconfig already exists, in which case there's nothing to wait for or measure + # If kubeconfig already exists, kubelet finished TLS bootstrapping before this service started. + # Emit the completed event using the start time written immediately before kubelet startup. if [ -f "$KUBECONFIG_PATH" ]; then echo "kubeconfig already exists at: $KUBECONFIG_PATH" + emitTLSBootstrappingCompletedEvent "$START_TIME" exit 0 fi echo "watching for kubeconfig to be created at $KUBECONFIG_PATH with ${WATCH_TIMEOUT_SECONDS}s timeout..." - START_TIME=$(date +"%F %T.%3N") - inotifywait -t $WATCH_TIMEOUT_SECONDS -qme create "$KUBECONFIG_DIR" | while read -r DIR EVENT FILE; do + inotifywait -t "$WATCH_TIMEOUT_SECONDS" -qme create "$KUBECONFIG_DIR" | while read -r DIR EVENT FILE; do if [ "${EVENT,,}" = "create" ] && [ "${DIR}${FILE}" = "$KUBECONFIG_PATH" ]; then - END_TIME=$(date +"%F %T.%3N") echo "new kubeconfig created at: $KUBECONFIG_PATH" - # we only create the guest agent event if the certificate was created while we were watching - createGuestAgentEvent "AKS.Runtime.waitForTLSBootstrapping" "$START_TIME" "$END_TIME" + emitTLSBootstrappingCompletedEvent "$START_TIME" # this is ugly, but it's the best way to ensure that we don't leave inotifywait running in the background consuming resources kill -- -$$ fi done - # check once more if there was a kubeconfig created after finishing the inotifywait loop - # to avoid data skewing, we don't emit a guest agent event in this case - # this would only happen if we hit a race condition between first checking kubeconfig existence and starting inotifywait + # Check once more in case 
kubeconfig was created after the initial existence check but before + # inotifywait started listening. This preserves the latency signal for fast kubelet startups. if [ -f "$KUBECONFIG_PATH" ]; then echo "kubeconfig now exists at: $KUBECONFIG_PATH" + emitTLSBootstrappingCompletedEvent "$START_TIME" else END_TIME=$(date +"%F %T.%3N") echo "kubeconfig was not created after ${WATCH_TIMEOUT_SECONDS}s" diff --git a/spec/parts/linux/cloud-init/artifacts/measure_tls_bootstrapping_latency_spec.sh b/spec/parts/linux/cloud-init/artifacts/measure_tls_bootstrapping_latency_spec.sh index 5a07e7a1fa5..c983018d1f1 100644 --- a/spec/parts/linux/cloud-init/artifacts/measure_tls_bootstrapping_latency_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/measure_tls_bootstrapping_latency_spec.sh @@ -5,6 +5,7 @@ Describe 'measure-tls-bootstrapping-latency.sh' KUBECONFIG_PATH="spec-test/kubeconfig" KUBECONFIG_DIR="$(dirname "$KUBECONFIG_PATH")" + TLS_BOOTSTRAPPING_START_TIME_FILEPATH="spec-test/tls-bootstrap-start-time" WATCH_TIMEOUT_SECONDS=3 createGuestAgentEvent() { @@ -16,8 +17,12 @@ Describe 'measure-tls-bootstrapping-latency.sh' kill() { echo "kill $@" } + writeStartTimeFile() { + mkdir -p "$(dirname "$TLS_BOOTSTRAPPING_START_TIME_FILEPATH")" + echo "2026-03-17 00:00:00.000" > "$TLS_BOOTSTRAPPING_START_TIME_FILEPATH" + } cleanup() { - rm -rf $KUBECONFIG_DIR + rm -rf $KUBECONFIG_DIR "$(dirname "$TLS_BOOTSTRAPPING_START_TIME_FILEPATH")" } AfterEach 'cleanup' @@ -32,16 +37,25 @@ Describe 'measure-tls-bootstrapping-latency.sh' The status should be success End + It 'should exit 0 if TLS bootstrapping start time file does not exist' + When run waitForTLSBootstrapping + The stdout should include 'TLS bootstrapping start time file not found at: spec-test/tls-bootstrap-start-time' + The status should be success + End + It 'should exit 0 if KUBECONFIG_PATH already exists' + writeStartTimeFile mkdir -p "$(dirname "$KUBECONFIG_PATH")" touch $KUBECONFIG_PATH When run waitForTLSBootstrapping The 
stdout should include 'kubeconfig already exists at: spec-test/kubeconfig' + The stdout should include 'createGuestAgentEvent AKS.Runtime.waitForTLSBootstrapping 2026-03-17 00:00:00.000' The status should be success End It 'should create a guest agent event when a kubeconfig is created' + writeStartTimeFile inotifywait() { echo "$KUBECONFIG_DIR/ CREATE kubeconfig" return 0 @@ -56,6 +70,7 @@ Describe 'measure-tls-bootstrapping-latency.sh' End It 'should not create a guest agent event if a kubeconfig is never created' + writeStartTimeFile inotifywait() { echo "$KUBECONFIG_DIR/ CREATE other-file" return 0 @@ -70,6 +85,7 @@ Describe 'measure-tls-bootstrapping-latency.sh' End It 'should not create a guest agent event if inotifywait times out without observing anything' + writeStartTimeFile inotifywait() { sleep $WATCH_TIMEOUT_SECONDS return 0 @@ -83,7 +99,8 @@ Describe 'measure-tls-bootstrapping-latency.sh' The status should be success End - It 'should not create a guest agent event if kubeconfig creation was never observed, but did occur due to race condition' + It 'should create a guest agent event if kubeconfig creation was never observed, but did occur due to race condition' + writeStartTimeFile inotifywait() { touch $KUBECONFIG_PATH echo "$KUBECONFIG_DIR/ CREATE other-file" @@ -93,8 +110,8 @@ Describe 'measure-tls-bootstrapping-latency.sh' When run waitForTLSBootstrapping The stdout should include 'watching for kubeconfig to be created at spec-test/kubeconfig with 3s timeout...' 
The stdout should include 'kubeconfig now exists at: spec-test/kubeconfig' - The stdout should not include 'createGuestAgentEvent AKS.Runtime.waitForTLSBootstrapping' + The stdout should include 'createGuestAgentEvent AKS.Runtime.waitForTLSBootstrapping 2026-03-17 00:00:00.000' The stdout should not include 'kill -- -' The status should be success End -End \ No newline at end of file +End diff --git a/vhdbuilder/packer/cleanup-vhd.sh b/vhdbuilder/packer/cleanup-vhd.sh index 6581ddd5011..e0b60c3ae63 100644 --- a/vhdbuilder/packer/cleanup-vhd.sh +++ b/vhdbuilder/packer/cleanup-vhd.sh @@ -1,5 +1,8 @@ #!/bin/bash -eux +systemctl daemon-reload +systemctl disable --now containerd + # Cleanup packer SSH key and machine ID generated for this boot rm -f /root/.ssh/authorized_keys rm -f /home/packer/.ssh/authorized_keys diff --git a/vhdbuilder/packer/test/linux-vhd-content-test.sh b/vhdbuilder/packer/test/linux-vhd-content-test.sh index 4eb37245bd9..bb76a887fcf 100644 --- a/vhdbuilder/packer/test/linux-vhd-content-test.sh +++ b/vhdbuilder/packer/test/linux-vhd-content-test.sh @@ -19,6 +19,8 @@ IMG_SKU="$5" FEATURE_FLAGS="$6" GIT_COMMIT_HASH="$7" +systemctl daemon-reload && systemctl restart containerd + # List of "ERROR/WARNING" message we want to ignore in the cloud-init.log # 1. "Command ['hostname', '-f']": # Running hostname -f will fail on current AzureLinux AKS image. 
We do not have an active plan to resolve diff --git a/vhdbuilder/packer/trivy-scan.sh b/vhdbuilder/packer/trivy-scan.sh index a430c217bc3..9e558531a15 100644 --- a/vhdbuilder/packer/trivy-scan.sh +++ b/vhdbuilder/packer/trivy-scan.sh @@ -46,6 +46,7 @@ CVE_DIFF_UPLOAD_REPORT_NAME=${30} CVE_LIST_UPLOAD_REPORT_NAME=${31} SCAN_RESOURCE_PREFIX=${32} +source /opt/azure/containers/provision_source.sh source /opt/azure/containers/provision_source_distro.sh retrycmd_if_failure() { @@ -144,6 +145,8 @@ else exit 1 fi +systemctlEnableAndStart containerd 30 || exit 4 + mkdir -p "$(dirname "${TRIVY_REPORT_DIRNAME}")" curl -fL -o "trivy_${TRIVY_VERSION}_${TRIVY_ARCH}.tar.gz" "https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/trivy_${TRIVY_VERSION}_${TRIVY_ARCH}.tar.gz"