From 54bc81c29d0363f2d728bf008304a62de9ed1439 Mon Sep 17 00:00:00 2001 From: Sonic Build Admin Date: Fri, 30 Jan 2026 07:03:02 +0000 Subject: [PATCH] [Nvidia-Bluefield] Change sonic-bfb-installer reboot flow to fix pmon sensor errors #### Why I did it Fix transient errors during bfb install on smartswitch platform. ``` ERR pmon#sensord: Error getting sensor data: mp2975/#16: Kernel interface error ``` ##### Work item tracking - Microsoft ADO **(number only)**: #### How I did it Use pre-shutdown procedures before doing a reboot #### How to verify it Installation of bfb image on dpu from switch shouldn't cause errors #### Which release branch to backport (provide reason below if selected) - [ ] 202205 - [ ] 202211 - [ ] 202305 - [ ] 202311 - [ ] 202405 - [ ] 202411 - [ ] 202505 #### Tested branch (Please provide the tested image version) - [ ] - [ ] #### Description for the changelog #### Link to config_db schema for YANG module changes #### A picture of a cute animal (not mandatory but encouraged) --- platform/mellanox/sonic-bfb-installer.sh | 51 +++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/platform/mellanox/sonic-bfb-installer.sh b/platform/mellanox/sonic-bfb-installer.sh index 8f25f11acd3..74098088c6c 100755 --- a/platform/mellanox/sonic-bfb-installer.sh +++ b/platform/mellanox/sonic-bfb-installer.sh @@ -78,6 +78,10 @@ PLATFORM_JSON=/usr/share/sonic/device/$PLATFORM/platform.json declare -A rshim2dpu declare -r bfsoc_dev_id="15b3:c2d5" declare -r cx7_dev_id="15b3:a2dc" +declare -r REBOOT_HELPER_SCRIPT="/usr/local/bin/reboot_smartswitch_helper" + +# Source reboot helper script at initialization if available +[[ -f "$REBOOT_HELPER_SCRIPT" ]] && source "$REBOOT_HELPER_SCRIPT" # Local functions to replace dpumap.sh rshim2dpu_map() { @@ -199,6 +203,47 @@ remove_cx_pci_device() { fi } +# Function to check if CHASSIS_MODULE_TABLE entry exists for a specific DPU +is_chassis_module_table_present() { + local dpu_name=$1 + local output + output=$(sonic-db-cli STATE_DB KEYS "CHASSIS_MODULE_TABLE|${dpu_name}" 2>/dev/null) + if [[ -z "$output" ]]; then + return 1 + fi + return 0 +} + +# Function to execute dpuctl dpu-reset command +run_dpuctl_reset() { + local dpu=$1 + local use_verbose=$2 + local reset_cmd="dpuctl dpu-reset --force $dpu" + if [[ "$use_verbose" == true ]]; then + reset_cmd="$reset_cmd -v" + fi + eval $reset_cmd +} + +# Function to reset DPU using reboot helper or fallback to dpuctl +reset_dpu() { + local dpu=$1 + local use_verbose=$2 + local dpu_upper="${dpu^^}" # Convert to uppercase (e.g., dpu0 -> DPU0) + + # Check if reboot helper is available and CHASSIS_MODULE_TABLE entry exists for this DPU + if [[ -f "$REBOOT_HELPER_SCRIPT" ]] && is_chassis_module_table_present "$dpu_upper"; then + log_info "Using reboot helper pre/post shutdown methods while resetting $dpu_upper" + module_pre_shutdown "$dpu_upper" + run_dpuctl_reset "$dpu" "$use_verbose" + module_post_startup "$dpu_upper" + else + # Fallback to dpuctl + log_info "Using dpuctl to reset $dpu" + run_dpuctl_reset "$dpu" "$use_verbose" + fi +} + monitor_installation() { local -r rid=$1 local -r pid=$2 @@ -311,11 +356,7 @@ bfb_install_call() { stop_rshim_daemon "$rid" log_info "$rid: Resetting DPU $dpu" - local reset_cmd="dpuctl dpu-reset --force $dpu" - if [[ $verbose == true ]]; then - reset_cmd="$reset_cmd -v" - fi - eval $reset_cmd + reset_dpu "$dpu" "$verbose" } file_cleanup(){