-
Notifications
You must be signed in to change notification settings - Fork 266
feat: improve cse bootstrap latency by deferring non-critical work #8105
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -164,20 +164,16 @@ configureCustomCaCertificate() { | |
| done | ||
| # blocks until svc is considered active, which will happen when ExecStart command terminates with code 0 | ||
| systemctl restart update_certs.service || exit $ERR_UPDATE_CA_CERTS | ||
| # containerd has to be restarted after new certs are added to the trust store, otherwise they will not be used until restart happens | ||
| systemctl restart containerd | ||
| } | ||
|
|
||
| configureContainerdUlimits() { | ||
| CONTAINERD_ULIMIT_DROP_IN_FILE_PATH="/etc/systemd/system/containerd.service.d/set_ulimits.conf" | ||
| mkdir -p "$(dirname "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}")" | ||
| touch "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" | ||
| chmod 0600 "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" | ||
| tee "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" > /dev/null <<EOF | ||
| $(echo "$CONTAINERD_ULIMITS" | tr ' ' '\n') | ||
| EOF | ||
|
awesomenix marked this conversation as resolved.
|
||
|
|
||
| systemctl daemon-reload | ||
| systemctl restart containerd | ||
| } | ||
|
|
||
| # file paths defined outside so configureAzureJson can be unit tested | ||
|
|
@@ -301,9 +297,10 @@ EOF | |
| } | ||
|
|
||
| configureCNI() { | ||
| # needed for the iptables rules to work on bridges | ||
| retrycmd_if_failure 120 5 25 modprobe br_netfilter || exit $ERR_MODPROBE_FAIL | ||
| # needed for bridge iptables rules and customer-configured conntrack sysctls | ||
| retrycmd_if_failure 120 5 25 modprobe -a br_netfilter nf_conntrack || exit $ERR_MODPROBE_FAIL | ||
| echo -n "br_netfilter" > /etc/modules-load.d/br_netfilter.conf | ||
| echo -n "nf_conntrack" > /etc/modules-load.d/nf_conntrack.conf | ||
| configureCNIIPTables | ||
| } | ||
|
|
||
|
|
@@ -381,7 +378,7 @@ net.ipv6.conf.all.forwarding = 1 | |
| net.bridge.bridge-nf-call-iptables = 1 | ||
| EOF | ||
| retrycmd_if_failure 120 5 25 sysctl --system || exit $ERR_SYSCTL_RELOAD | ||
| systemctlEnableAndStart containerd 30 || exit $ERR_SYSTEMCTL_START_FAIL | ||
| systemctlEnableAndStartNoBlock containerd 30 || exit $ERR_SYSTEMCTL_START_FAIL | ||
|
awesomenix marked this conversation as resolved.
|
||
| } | ||
|
|
||
| configureContainerdRegistryHost() { | ||
|
|
@@ -424,6 +421,7 @@ ensureNoDupOnPromiscuBridge() { | |
| } | ||
|
|
||
| ensureArtifactStreaming() { | ||
| waitForContainerdReady || exit $ERR_ARTIFACT_STREAMING_INSTALL | ||
| retrycmd_if_failure 120 5 25 time systemctl --quiet enable --now acr-mirror overlaybd-tcmu overlaybd-snapshotter | ||
| time /opt/acr/bin/acr-config --enable-containerd 'azurecr.io' | ||
| } | ||
|
|
@@ -566,6 +564,8 @@ ensurePodInfraContainerImage() { | |
| POD_INFRA_CONTAINER_IMAGE_DOWNLOAD_DIR="/opt/pod-infra-container-image/downloads" | ||
| POD_INFRA_CONTAINER_IMAGE_TAR="/opt/pod-infra-container-image/pod-infra-container-image.tar" | ||
|
|
||
| waitForContainerdReady || exit $ERR_PULL_POD_INFRA_CONTAINER_IMAGE | ||
|
|
||
| pod_infra_container_image=$(get_sandbox_image) | ||
|
|
||
| if [ -z "${pod_infra_container_image}" ]; then | ||
|
|
@@ -776,17 +776,22 @@ EOF | |
| logs_to_events "AKS.CSE.ensureKubelet.ensurePodInfraContainerImage" ensurePodInfraContainerImage | ||
| fi | ||
|
|
||
| # start measure-tls-bootstrapping-latency.service without waiting for the main process to start, while ignoring any failures | ||
| if ! systemctlEnableAndStartNoBlock measure-tls-bootstrapping-latency 30; then | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this really adding a lot of latency? mainly asking since moving this below ensureKubelet seems to add a fair bit of complexity |
||
| echo "failed to start measure-tls-bootstrapping-latency.service" | ||
| fi | ||
| local tls_bootstrapping_start_time_filepath="/opt/azure/containers/tls-bootstrap-start-time" | ||
| date +"%F %T.%3N" > "${tls_bootstrapping_start_time_filepath}" | ||
|
|
||
| # start kubelet.service without waiting for the main process to start, though check whether it has entered a failed state after enablement | ||
| if ! systemctlEnableAndStartNoBlock kubelet 240; then | ||
| # append kubelet status to CSE output to ensure we can see it | ||
| rm -f "${tls_bootstrapping_start_time_filepath}" | ||
| journalctl -u kubelet.service --no-pager || true | ||
| exit $ERR_KUBELET_START_FAIL | ||
| fi | ||
|
|
||
| # start measure-tls-bootstrapping-latency.service without waiting for the main process to start, while ignoring any failures | ||
| if ! systemctlEnableAndStartNoBlock measure-tls-bootstrapping-latency 30; then | ||
| rm -f "${tls_bootstrapping_start_time_filepath}" | ||
| echo "failed to start measure-tls-bootstrapping-latency.service" | ||
| fi | ||
|
awesomenix marked this conversation as resolved.
|
||
| } | ||
|
|
||
| ensureSnapshotUpdate() { | ||
|
|
@@ -923,6 +928,7 @@ configAzurePolicyAddon() { | |
|
|
||
| configGPUDrivers() { | ||
| if [ "$OS" = "$UBUNTU_OS_NAME" ]; then | ||
| waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL | ||
| mkdir -p /opt/{actions,gpu} | ||
| ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG | ||
| retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -507,19 +507,20 @@ systemctlEnableAndStart() { | |
| service=$1; timeout=$2 | ||
| systemctl_restart 100 5 $timeout $service | ||
| RESTART_STATUS=$? | ||
| systemctl status $service --no-pager -l > /var/log/azure/$service-status.log | ||
| if [ $RESTART_STATUS -ne 0 ]; then | ||
| echo "$service could not be started" | ||
| systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true | ||
| return 1 | ||
| fi | ||
| if ! retrycmd_if_failure 120 5 25 systemctl enable $service; then | ||
| echo "$service could not be enabled by systemctl" | ||
| systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true | ||
| return 1 | ||
| fi | ||
| } | ||
|
|
||
| systemctlEnableAndStartNoBlock() { | ||
| service=$1; timeout=$2; status_check_delay_seconds=${3:-"0"} | ||
| service=$1; timeout=$2 | ||
|
|
||
| systemctl_restart_no_block 100 5 $timeout $service | ||
| RESTART_STATUS=$? | ||
|
|
@@ -534,21 +535,36 @@ systemctlEnableAndStartNoBlock() { | |
| systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true | ||
| return 1 | ||
| fi | ||
| } | ||
|
Comment on lines
522
to
+538
|
||
|
|
||
| checkServiceHealth() { | ||
| local service=$1 | ||
| local state=$(systemctl show -p ActiveState --value "$service") | ||
|
|
||
| if [ "$state" = "active" ]; then | ||
| return 0 | ||
| fi | ||
|
|
||
| # wait for the specified delay seconds before checking the service status to make sure | ||
| # it hasn't gone into a failed state | ||
| sleep $status_check_delay_seconds | ||
| systemctl status "$service" --no-pager -l > "/var/log/azure/$service-status.log" || true | ||
|
|
||
| if systemctl is-failed $service; then | ||
| if [ "$state" = "failed" ]; then | ||
| echo "$service is in a failed state" | ||
| systemctl status $service --no-pager -l > /var/log/azure/$service-status.log || true | ||
| return 1 | ||
| elif [ "$state" = "activating" ]; then | ||
| echo "$service is still activating, continuing anyway..." | ||
| fi | ||
| } | ||
|
Devinwong marked this conversation as resolved.
|
||
|
|
||
| # systemctl status only exits with code 0 iff the service is "active", | ||
| # thus we handle the "activating" case by checking for a non-zero exit code | ||
| if ! systemctl status $service --no-pager -l > /var/log/azure/$service-status.log; then | ||
| echo "$service is still activating, continuing anyway..." | ||
| waitForContainerdReady() { | ||
| local ret=0 | ||
|
|
||
| echo "Waiting for containerd to become ready..." | ||
| retrycmd_if_failure 60 0.1 1 bash -c 'ctr version >/dev/null 2>&1' | ||
| ret=$? | ||
| if [ "$ret" -ne 0 ]; then | ||
| echo "containerd did not become ready" | ||
| systemctl status containerd --no-pager -l > /var/log/azure/containerd-status.log || true | ||
| return 1 | ||
| fi | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -152,9 +152,13 @@ function basePrep { | |
| echo "Golden image; skipping dependencies installation" | ||
| fi | ||
|
|
||
| # Container runtime already installed on Azure Linux OS Guard | ||
| if ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then | ||
| # Container runtime already installed on Azure Linux OS Guard; an explicit containerd override can bypass FULL_INSTALL_REQUIRED for other Linux distros | ||
| if isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then | ||
| echo "Skipping installContainerRuntime because containerd is already available" | ||
| elif [ "$FULL_INSTALL_REQUIRED" = "true" ] || [ -n "${CONTAINERD_PACKAGE_URL}" ]; then | ||
| logs_to_events "AKS.CSE.installContainerRuntime" installContainerRuntime | ||
| else | ||
| echo "Skipping installContainerRuntime because containerd is already available" | ||
| fi | ||
| setupCNIDirs | ||
|
|
||
|
|
@@ -170,6 +174,9 @@ function basePrep { | |
| SHOULD_ENFORCE_KUBE_PMC_INSTALL=$(should_enforce_kube_pmc_install) | ||
| logs_to_events "AKS.CSE.configureKubeletAndKubectl" configureKubeletAndKubectl | ||
|
|
||
| # pre-warm kubelet by checking its version. | ||
| nohup /bin/sh -c '/opt/bin/kubelet --version >/dev/null 2>&1' >/dev/null 2>&1 & | ||
|
|
||
| createKubeManifestDir | ||
|
|
||
| if [ "${HAS_CUSTOM_SEARCH_DOMAIN}" = "true" ]; then | ||
|
|
@@ -194,6 +201,10 @@ function basePrep { | |
| logs_to_events "AKS.CSE.configureSystemdUseDomains" configureSystemdUseDomains | ||
| fi | ||
|
|
||
| if [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits | ||
| fi | ||
|
|
||
| # containerd should not be configured until cni has been configured first | ||
| logs_to_events "AKS.CSE.ensureContainerd" ensureContainerd | ||
|
|
||
|
|
@@ -268,14 +279,6 @@ EOF | |
|
|
||
| logs_to_events "AKS.CSE.ensureSysctl" ensureSysctl || exit $ERR_SYSCTL_RELOAD | ||
|
|
||
| if [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits | ||
| fi | ||
|
|
||
| if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge | ||
| fi | ||
|
|
||
| if ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then | ||
| if [ "$OS" = "$UBUNTU_OS_NAME" ] || isMarinerOrAzureLinux "$OS"; then | ||
| logs_to_events "AKS.CSE.ubuntuSnapshotUpdate" ensureSnapshotUpdate | ||
|
|
@@ -351,11 +354,6 @@ function nodePrep { | |
| # By default, never reboot new nodes. | ||
| REBOOTREQUIRED=false | ||
|
|
||
| # Clean up GPU drivers if not a GPU node or if skipping driver install | ||
| if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers | ||
| fi | ||
|
|
||
| # Install and configure GPU drivers if this is a GPU node | ||
| if [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then | ||
| echo $(date),$(hostname), "Start configuring GPU drivers" | ||
|
|
@@ -484,10 +482,27 @@ function nodePrep { | |
| exit $VALIDATION_ERR | ||
| fi | ||
|
|
||
| checkServiceHealth containerd || exit $ERR_SYSTEMCTL_START_FAIL | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: it would be better if we had specific exit codes here, that way we can always pinpoint exactly which call to checkServiceHealth is failing |
||
| if [ "${ENABLE_SECURE_TLS_BOOTSTRAPPING}" = "true" ]; then | ||
| checkServiceHealth secure-tls-bootstrap || exit $ERR_SYSTEMCTL_START_FAIL | ||
| fi | ||
|
|
||
| logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we teach AI in the PR that it should report when changes are introduced before ensure Kubelet which will have direct impact on nodebootstrapping time ? |
||
|
|
||
| if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. weird that the service file has
in this case, I'm not sure if calling the enable/start will have an effect, not that we actually start containd/kubelet before ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes |
||
| fi | ||
|
awesomenix marked this conversation as resolved.
|
||
|
|
||
| logs_to_events "AKS.CSE.configureNodeExporter" configureNodeExporter | ||
|
|
||
|
|
||
| # Clean up GPU drivers if not a GPU node or if skipping driver install | ||
| if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers | ||
| fi | ||
|
|
||
| checkServiceHealth kubelet || exit $ERR_KUBELET_FAIL | ||
|
|
||
| if $REBOOTREQUIRED; then | ||
| echo 'reboot required, rebooting node in 1 minute' | ||
| /bin/bash -c "shutdown -r 1 &" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ ExecStartPre=-/sbin/ebtables -t nat --list | |
| ExecStartPre=-/sbin/iptables -t nat --numeric --list | ||
|
|
||
| ExecStartPre=/bin/bash /opt/azure/containers/validate-kubelet-credentials.sh | ||
| ExecStartPre=/bin/sh -c 'until [ -S /run/containerd/containerd.sock ]; do sleep 0.1; done' | ||
|
awesomenix marked this conversation as resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this to avoid kubelet going into a bad exponential back-off or something? |
||
|
|
||
| ExecStart=/opt/bin/kubelet \ | ||
| --enable-server \ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.