Skip to content

Commit 2d228c1

Browse files
committed
[SmartSwitch] add graceful shutdown/startup utilities and visibility
<!-- Please make sure you've read and understood our contributing guidelines: https://github.com/Azure/SONiC/blob/gh-pages/CONTRIBUTING.md ** Make sure all your commits include a signature generated with `git commit -s` ** If this is a bug fix, make sure your description includes "closes #xxxx", "fixes #xxxx" or "resolves #xxxx" so that GitHub automatically closes the related issue when the PR is merged. If you are adding/modifying/removing any command or utility script, please also make sure to add/modify/remove any unit tests from the tests directory as appropriate. If you are modifying or removing an existing 'show', 'config' or 'sonic-clear' subcommand, or you are adding a new subcommand, please make sure you also update the Command Line Reference Guide (doc/Command-Reference.md) to reflect your changes. Please provide the following information: -->

HLD: https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/graceful-shutdown/graceful-shutdown.md

These changes build upon enhancements in sonic-net#4031. This PR adds CLI support and visibility for module-level graceful transitions (startup/shutdown/reboot) to align with the SmartSwitch/DPU lifecycle work.

#### What I did
- Added support to view module transition states (startup, shutdown, reboot) through CLI.
- Integrated with STATE_DB CHASSIS_MODULE_TABLE to display transition status, type, and elapsed time.
- Enhanced user experience with readable durations and exit codes for automation.
- Implemented comprehensive unit tests for transition visibility, parsing, and error handling.

#### How I did it
- Added a helper class to read STATE_DB entries:
  - state_transition_in_progress
  - transition_type
  - transition_start_time
- Implemented robust error handling for missing or malformed DB entries.
- Added pytest-based unit tests using mocked state_db_connector.
#### How to verify it
- Build and install the updated sonic-utilities package on DUT
- Check Redis entries: `redis-cli -n 6 hgetall "CHASSIS_MODULE_TABLE|DPU0"`
- Run the module startup/shutdown commands
- Run unit tests

#### Sample outputs when "state_transition_in_progress" is set
Errors are thrown when the same module transition is already in progress.

$ sudo config chassis modules shutdown DPU2;redis-cli -n 6 hgetall 'CHASSIS_MODULE_TABLE|DPU2';sudo reboot -d DPU2;redis-cli -n 6 hgetall 'CHASSIS_MODULE_TABLE|DPU2'
Shutting down chassis module DPU2
1) "desc"
2) "NVIDIA XXXXXX DPU"
3) "slot"
4) "N/A"
5) "oper_status"
6) "Online"
7) "serial"
8) "XXXXXXXXXX"
9) "transition_in_progress"
10) "True"
11) "transition_type"
12) "shutdown"
13) "transition_start_time"
14) "1763059401"
True
2025-11-13 18:43:22 - User requested rebooting device dpu2 ...
2025-11-13 18:43:23 - INFO: DPU dpu2 is in 'Online' state before reboot.
2025-11-13 18:43:23 - ERROR: state_transition_in_progress flag is already set for dpu2

#### Previous command output (if the output of a command-line utility has changed)

#### New command output (if the output of a command-line utility has changed)

$ reboot -d DPU1
True
2025-11-17 17:56:10 - User requested rebooting device dpu1 ...
2025-11-17 17:56:11 - INFO: DPU dpu1 is in 'Online' state before reboot.
2025-11-17 17:56:12 - INFO: Rebooting dpu1, ip:1X9.XXX.X00.2 gnmi_port:50XXX
2025-11-17 17:56:53 - INFO: dpu1 halted the services successfully
2025-11-17 17:58:50 - INFO: Rebooting dpu1 with reboot_type:DPU...
1 parent b056ba1 commit 2d228c1

5 files changed

Lines changed: 345 additions & 187 deletions

File tree

config/chassis_modules.py

Lines changed: 9 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import subprocess
77
import utilities_common.cli as clicommon
88
from utilities_common.chassis import is_smartswitch, get_all_dpus
9+
from utilities_common.module import ModuleHelper
910
from datetime import datetime, timedelta, timezone
1011

1112
TIMEOUT_SECS = 10
@@ -64,50 +65,6 @@ def get_config_module_state(db, chassis_module_name):
6465
else:
6566
return fvs['admin_status']
6667

67-
68-
def get_state_transition_in_progress(db, chassis_module_name):
69-
ensure_statedb_connected(db)
70-
fvs = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
71-
value = fvs.get('state_transition_in_progress', 'False') if fvs else 'False'
72-
return value
73-
74-
75-
def set_state_transition_in_progress(db, chassis_module_name, value):
76-
ensure_statedb_connected(db)
77-
state_db = db.statedb
78-
entry = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name) or {}
79-
entry['state_transition_in_progress'] = value
80-
if value == 'True':
81-
entry['transition_start_time'] = datetime.now(timezone.utc).isoformat()
82-
else:
83-
# Remove transition_start_time from both local entry and database
84-
entry.pop('transition_start_time', None)
85-
state_db.delete_field('CHASSIS_MODULE_TABLE', chassis_module_name, 'transition_start_time')
86-
state_db.set_entry('CHASSIS_MODULE_TABLE', chassis_module_name, entry)
87-
88-
89-
def is_transition_timed_out(db, chassis_module_name):
90-
ensure_statedb_connected(db)
91-
state_db = db.statedb
92-
fvs = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
93-
if not fvs:
94-
return False
95-
start_time_str = fvs.get('transition_start_time')
96-
if not start_time_str:
97-
return False
98-
try:
99-
start_time = datetime.fromisoformat(start_time_str)
100-
except ValueError:
101-
return False
102-
103-
# Use UTC everywhere for consistent comparison
104-
current_time = datetime.now(timezone.utc)
105-
if start_time.tzinfo is None:
106-
# If stored time is naive, assume it's UTC
107-
start_time = start_time.replace(tzinfo=timezone.utc)
108-
109-
return current_time - start_time > TRANSITION_TIMEOUT
110-
11168
#
11269
# Name: check_config_module_state_with_timeout
11370
# return: True: timeout, False: not timeout
@@ -196,15 +153,10 @@ def shutdown_chassis_module(db, chassis_module_name):
196153
return
197154

198155
if is_smartswitch():
199-
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
200-
if is_transition_timed_out(db, chassis_module_name):
201-
set_state_transition_in_progress(db, chassis_module_name, 'False')
202-
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with shutdown.")
203-
else:
204-
click.echo(f"Module {chassis_module_name} state transition is already in progress")
205-
return
206-
else:
207-
set_state_transition_in_progress(db, chassis_module_name, 'True')
156+
module_helper = ModuleHelper()
157+
if module_helper.get_module_state_transition(chassis_module_name):
158+
click.echo(f"Module {chassis_module_name} state transition is already in progress")
159+
return
208160

209161
click.echo(f"Shutting down chassis module {chassis_module_name}")
210162
fvs = {
@@ -243,15 +195,10 @@ def startup_chassis_module(db, chassis_module_name):
243195
return
244196

245197
if is_smartswitch():
246-
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
247-
if is_transition_timed_out(db, chassis_module_name):
248-
set_state_transition_in_progress(db, chassis_module_name, 'False')
249-
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with startup.")
250-
else:
251-
click.echo(f"Module {chassis_module_name} state transition is already in progress")
252-
return
253-
else:
254-
set_state_transition_in_progress(db, chassis_module_name, 'True')
198+
module_helper = ModuleHelper()
199+
if module_helper.get_module_state_transition(chassis_module_name):
200+
click.echo(f"Module {chassis_module_name} state transition is already in progress")
201+
return
255202

256203
click.echo(f"Starting up chassis module {chassis_module_name}")
257204
fvs = {

scripts/reboot_smartswitch_helper

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ function get_reboot_status()
7171
function module_pre_shutdown()
7272
{
7373
local DPU_NAME=$1
74-
local DPU_BUS_INFO=$2
7574
python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.module_pre_shutdown('${DPU_NAME}')"
7675
if [ $? -ne 0 ]; then
7776
log_message "ERROR: Module pre-shutdown vendor API failed"
@@ -82,13 +81,46 @@ function module_pre_shutdown()
8281
function module_post_startup()
{
    # Run ModuleHelper.module_post_startup() for one DPU.
    #
    # Arguments:
    #   $1 - DPU name handed to ModuleHelper.module_post_startup()
    # Returns:
    #   The exit status of the python3 invocation (0 on success).
    #
    # The status is captured and returned explicitly. Without that, the
    # function's status would be that of the trailing "if" statement, which
    # is 0 when the branch is not taken and otherwise the status of
    # log_message, so a caller testing "$?" could not see the failure.
    local DPU_NAME=$1
    local rc

    python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.module_post_startup('${DPU_NAME}')"
    rc=$?
    if [ $rc -ne 0 ]; then
        log_message "ERROR: Module post-startup vendor API failed"
    fi
    return $rc
}
9189

90+
# Function to set state_transition_in_progress flag
91+
function set_module_state_transition_flag()
{
    # Set the state-transition flag of a DPU via
    # ModuleHelper.set_module_state_transition().
    #
    # Arguments:
    #   $1 - DPU name
    #   $2 - flag value; substituted verbatim into the Python call, so it
    #        must be a valid Python expression (the visible caller passes True)
    #
    # A non-zero exit from python3 is reported through log_message.
    local DPU_NAME=$1
    local FLAG_VALUE=$2
    local rc

    python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.set_module_state_transition('${DPU_NAME}', ${FLAG_VALUE})"
    rc=$?
    if [[ $rc -ne 0 ]]; then
        log_message "ERROR: Setting module state transition flag failed"
    fi
}
100+
101+
# Function to clear state_transition_in_progress flag
102+
function clear_module_state_transition_flag()
{
    # Clear the state-transition flag of a DPU via
    # ModuleHelper.clear_module_state_transition().
    #
    # Arguments:
    #   $1 - DPU name
    #
    # A non-zero exit from python3 is reported through log_message.
    local DPU_NAME=$1
    local rc

    python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.clear_module_state_transition('${DPU_NAME}')"
    rc=$?
    if [[ $rc -ne 0 ]]; then
        log_message "ERROR: Clearing module state transition flag failed"
    fi
}
110+
111+
# Function to get state_transition_in_progress flag
112+
function get_module_state_transition_flag()
{
    # Query the state-transition flag of a DPU via
    # ModuleHelper.get_module_state_transition().
    #
    # Arguments:
    #   $1 - DPU name
    # Returns:
    #   0 when the Python call printed exactly "True", 1 for any other
    #   output (including empty output).
    local DPU_NAME=$1
    local result

    result=$(python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); print(helper.get_module_state_transition('${DPU_NAME}'))")

    # The status of this comparison is the function's return value:
    # 0 (bash "true") on a match, 1 otherwise.
    [[ "$result" == "True" ]]
}
123+
92124
# Function to reboot DPU
93125
function reboot_dpu_platform()
94126
{
@@ -119,6 +151,8 @@ function wait_for_dpu_reboot_status()
119151
local poll_interval=5
120152
local waited_time=0
121153
while true; do
154+
sleep "$poll_interval"
155+
122156
local reboot_status
123157
get_reboot_status "${dpu_ip}" "${port}"
124158
reboot_status=$?
@@ -127,7 +161,6 @@ function wait_for_dpu_reboot_status()
127161
break
128162
fi
129163

130-
sleep "$poll_interval"
131164
waited_time=$((waited_time + poll_interval))
132165
if [ $waited_time -ge $dpu_halt_services_timeout ]; then
133166
log_message "ERROR: Timeout waiting for ${DPU_NAME} to finish halting the services"
@@ -171,7 +204,7 @@ function reboot_dpu()
171204
local REBOOT_TYPE=$2
172205
local DPU_INDEX=${DPU_NAME//[!0-9]/}
173206

174-
debug "User requested rebooting device ${DPU_NAME} ..."
207+
log_message "User requested rebooting device ${DPU_NAME} ..."
175208

176209
# Check if the DPU operation status is online before rebooting
177210
local oper_status
@@ -187,6 +220,16 @@ function reboot_dpu()
187220
fi
188221
fi
189222

223+
if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
224+
# get and set the state_transition_in_progress flag before reboot
225+
if ! get_module_state_transition_flag "${DPU_NAME}"; then
226+
set_module_state_transition_flag "${DPU_NAME}" True
227+
else
228+
log_message "ERROR: state_transition_in_progress flag is already set for ${DPU_NAME}"
229+
return ${EXIT_ERROR}
230+
fi
231+
fi
232+
190233
# Send reboot command to DPU
191234
gnmi_reboot_dpu "${DPU_NAME}"
192235
if [ $? -ne 0 ]; then
@@ -199,20 +242,32 @@ function reboot_dpu()
199242
return ${EXIT_ERROR}
200243
fi
201244

202-
module_pre_shutdown ${DPU_NAME} ${DPU_BUS_INFO}
245+
module_pre_shutdown "${DPU_NAME}"
203246
if [ $? -ne 0 ]; then
204-
log_message "ERROR: Failed to detach PCI module for ${DPU_NAME}"
247+
log_message "ERROR: Failed to pre-shutdown the module for ${DPU_NAME}"
248+
if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
249+
clear_module_state_transition_flag "${DPU_NAME}"
250+
fi
205251
return ${EXIT_ERROR}
206252
fi
207253

208-
reboot_dpu_platform ${DPU_NAME} ${REBOOT_TYPE}
254+
reboot_dpu_platform "${DPU_NAME}" "${REBOOT_TYPE}"
209255
if [ $? -ne 0 ]; then
210-
log_message "ERROR: Failed to send platform command to reboot ${DPU_NAME}"
256+
log_message "ERROR: Failed to reboot the module for ${DPU_NAME}"
257+
if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
258+
clear_module_state_transition_flag "${DPU_NAME}"
259+
fi
211260
return ${EXIT_ERROR}
212261
fi
213262

214263
if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
215-
module_post_startup ${DPU_NAME} ${DPU_BUS_INFO}
264+
module_post_startup "${DPU_NAME}"
265+
if [ $? -ne 0 ]; then
266+
log_message "ERROR: Failed to startup the module for ${DPU_NAME}"
267+
clear_module_state_transition_flag "${DPU_NAME}"
268+
return ${EXIT_ERROR}
269+
fi
270+
clear_module_state_transition_flag "${DPU_NAME}"
216271
fi
217272
}
218273

0 commit comments

Comments
 (0)