Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 9 additions & 62 deletions config/chassis_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import utilities_common.cli as clicommon
from utilities_common.chassis import is_smartswitch, get_all_dpus
from utilities_common.module import ModuleHelper
from datetime import datetime, timedelta, timezone

TIMEOUT_SECS = 10
Expand Down Expand Up @@ -64,50 +65,6 @@ def get_config_module_state(db, chassis_module_name):
else:
return fvs['admin_status']


def get_state_transition_in_progress(db, chassis_module_name):
    """Return the stored ``state_transition_in_progress`` value for a module.

    The value is read from the module's ``CHASSIS_MODULE_TABLE`` entry in the
    state DB and handed back exactly as stored. The string ``'False'`` is
    returned when the entry is missing/empty or does not carry the field.
    """
    ensure_statedb_connected(db)
    entry = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
    if not entry:
        return 'False'
    return entry.get('state_transition_in_progress', 'False')


def set_state_transition_in_progress(db, chassis_module_name, value):
    """Store the ``state_transition_in_progress`` flag for a module.

    The module's ``CHASSIS_MODULE_TABLE`` entry in the state DB is read (an
    empty dict is used when there is none), updated with ``value`` and written
    back. When ``value`` is the string ``'True'`` the entry is also stamped
    with ``transition_start_time`` (current UTC time, ISO 8601). For any other
    value that timestamp is dropped from the entry and deleted from the DB.
    """
    table = 'CHASSIS_MODULE_TABLE'
    ensure_statedb_connected(db)
    store = db.statedb
    record = store.get_entry(table, chassis_module_name) or {}
    record['state_transition_in_progress'] = value
    if value != 'True':
        # Drop the timestamp from the local copy and from the database so a
        # stale start time cannot survive the write-back below.
        record.pop('transition_start_time', None)
        store.delete_field(table, chassis_module_name, 'transition_start_time')
    else:
        record['transition_start_time'] = datetime.now(timezone.utc).isoformat()
    store.set_entry(table, chassis_module_name, record)


def is_transition_timed_out(db, chassis_module_name):
    """Tell whether a module's recorded transition exceeded TRANSITION_TIMEOUT.

    Returns ``False`` when the module has no ``CHASSIS_MODULE_TABLE`` entry in
    the state DB, when the entry has no ``transition_start_time``, or when
    that value is not a parseable ISO 8601 timestamp. Otherwise returns
    whether the time elapsed since the recorded start is strictly greater
    than ``TRANSITION_TIMEOUT``.
    """
    ensure_statedb_connected(db)
    entry = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
    started_raw = entry.get('transition_start_time') if entry else None
    if not started_raw:
        return False

    try:
        started_at = datetime.fromisoformat(started_raw)
    except ValueError:
        return False

    # All arithmetic is done in UTC; a naive stored timestamp is taken as UTC.
    now = datetime.now(timezone.utc)
    if started_at.tzinfo is None:
        started_at = started_at.replace(tzinfo=timezone.utc)

    elapsed = now - started_at
    return elapsed > TRANSITION_TIMEOUT

#
# Name: check_config_module_state_with_timeout
# return: True: timeout, False: not timeout
Expand Down Expand Up @@ -196,15 +153,10 @@ def shutdown_chassis_module(db, chassis_module_name):
return

if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with shutdown.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
module_helper = ModuleHelper()
if module_helper.get_module_state_transition(chassis_module_name):
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return

click.echo(f"Shutting down chassis module {chassis_module_name}")
fvs = {
Expand Down Expand Up @@ -243,15 +195,10 @@ def startup_chassis_module(db, chassis_module_name):
return

if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with startup.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
module_helper = ModuleHelper()
if module_helper.get_module_state_transition(chassis_module_name):
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return

click.echo(f"Starting up chassis module {chassis_module_name}")
fvs = {
Expand Down
73 changes: 64 additions & 9 deletions scripts/reboot_smartswitch_helper
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ function get_reboot_status()
function module_pre_shutdown()
{
local DPU_NAME=$1
local DPU_BUS_INFO=$2
python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.module_pre_shutdown('${DPU_NAME}')"
if [ $? -ne 0 ]; then
log_message "ERROR: Module pre-shutdown vendor API failed"
Expand All @@ -82,13 +81,46 @@ function module_pre_shutdown()
# Run the post-startup hook for a DPU module through ModuleHelper.
# Arguments:
#   $1 - DPU module name
# Returns:
#   0 when the python3 invocation exits successfully, ${EXIT_ERROR} otherwise.
# NOTE(review): python3 -c exits non-zero only when the call raises; a falsy
# return value from helper.module_post_startup() is not detected here - confirm
# whether the helper signals failure by exception.
function module_post_startup()
{
    local DPU_NAME=$1
    python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.module_post_startup('${DPU_NAME}')"
    if [ $? -ne 0 ]; then
        log_message "ERROR: Module post-startup vendor API failed"
        # Propagate the failure: reboot_dpu checks $? right after this call,
        # and without an explicit return it would see log_message's status.
        return ${EXIT_ERROR}
    fi
    return 0
}

# Set the state_transition_in_progress flag of a DPU module through ModuleHelper.
# Arguments:
#   $1 - DPU module name
#   $2 - flag value, substituted verbatim into the Python call (e.g. True)
# A failing python3 invocation is only logged; the function's exit status is
# then that of log_message.
function set_module_state_transition_flag()
{
    local DPU_NAME=$1
    local FLAG_VALUE=$2
    if ! python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.set_module_state_transition('${DPU_NAME}', ${FLAG_VALUE})"; then
        log_message "ERROR: Setting module state transition flag failed"
    fi
}

# Clear the state_transition_in_progress flag of a DPU module through ModuleHelper.
# Arguments:
#   $1 - DPU module name
# A failing python3 invocation is only logged; the function's exit status is
# then that of log_message.
function clear_module_state_transition_flag()
{
    local DPU_NAME=$1
    if ! python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.clear_module_state_transition('${DPU_NAME}')"; then
        log_message "ERROR: Clearing module state transition flag failed"
    fi
}

# Query the state_transition_in_progress flag of a DPU module through ModuleHelper.
# Arguments:
#   $1 - DPU module name
# Returns:
#   0 (shell success) when the Python call prints exactly "True",
#   1 for any other output, including empty output.
function get_module_state_transition_flag()
{
    local DPU_NAME=$1
    local result
    result=$(python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); print(helper.get_module_state_transition('${DPU_NAME}'))")
    case "$result" in
        True)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

# Function to reboot DPU
function reboot_dpu_platform()
{
Expand Down Expand Up @@ -119,6 +151,8 @@ function wait_for_dpu_reboot_status()
local poll_interval=5
local waited_time=0
while true; do
sleep "$poll_interval"

local reboot_status
get_reboot_status "${dpu_ip}" "${port}"
reboot_status=$?
Expand All @@ -127,7 +161,6 @@ function wait_for_dpu_reboot_status()
break
fi

sleep "$poll_interval"
waited_time=$((waited_time + poll_interval))
if [ $waited_time -ge $dpu_halt_services_timeout ]; then
log_message "ERROR: Timeout waiting for ${DPU_NAME} to finish halting the services"
Expand Down Expand Up @@ -171,7 +204,7 @@ function reboot_dpu()
local REBOOT_TYPE=$2
local DPU_INDEX=${DPU_NAME//[!0-9]/}

debug "User requested rebooting device ${DPU_NAME} ..."
log_message "User requested rebooting device ${DPU_NAME} ..."

# Check if the DPU operation status is online before rebooting
local oper_status
Expand All @@ -187,6 +220,16 @@ function reboot_dpu()
fi
fi

if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
# get and set the state_transition_in_progress flag before reboot
if ! get_module_state_transition_flag "${DPU_NAME}"; then
set_module_state_transition_flag "${DPU_NAME}" True
else
log_message "ERROR: state_transition_in_progress flag is already set for ${DPU_NAME}"
return ${EXIT_ERROR}
fi
fi

# Send reboot command to DPU
gnmi_reboot_dpu "${DPU_NAME}"
if [ $? -ne 0 ]; then
Expand All @@ -199,20 +242,32 @@ function reboot_dpu()
return ${EXIT_ERROR}
fi

module_pre_shutdown ${DPU_NAME} ${DPU_BUS_INFO}
module_pre_shutdown "${DPU_NAME}"
if [ $? -ne 0 ]; then
log_message "ERROR: Failed to detach PCI module for ${DPU_NAME}"
log_message "ERROR: Failed to pre-shutdown the module for ${DPU_NAME}"
if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
clear_module_state_transition_flag "${DPU_NAME}"
fi
return ${EXIT_ERROR}
fi

reboot_dpu_platform ${DPU_NAME} ${REBOOT_TYPE}
reboot_dpu_platform "${DPU_NAME}" "${REBOOT_TYPE}"
if [ $? -ne 0 ]; then
log_message "ERROR: Failed to send platform command to reboot ${DPU_NAME}"
log_message "ERROR: Failed to reboot the module for ${DPU_NAME}"
if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
clear_module_state_transition_flag "${DPU_NAME}"
fi
return ${EXIT_ERROR}
fi

if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
module_post_startup ${DPU_NAME} ${DPU_BUS_INFO}
module_post_startup "${DPU_NAME}"
if [ $? -ne 0 ]; then
log_message "ERROR: Failed to startup the module for ${DPU_NAME}"
clear_module_state_transition_flag "${DPU_NAME}"
return ${EXIT_ERROR}
fi
clear_module_state_transition_flag "${DPU_NAME}"
fi
}

Expand Down
Loading
Loading