Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions scripts/gnoi_shutdown_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import redis
import threading
import sonic_py_common.daemon_base as daemon_base
from sonic_platform_base.module_base import ModuleBase
from sonic_py_common import syslogger
from swsscommon import swsscommon

Expand Down Expand Up @@ -126,6 +127,33 @@ def _handle_transition(self, dpu_name: str, transition_type: str) -> bool:
"""
logger.log_notice(f"{dpu_name}: Starting gNOI shutdown sequence")

# Check if DPU is already powered off / offline before attempting gNOI shutdown.
# This avoids error logs when config reload or reboot is issued while DPUs are
# already in the down state (e.g. admin_status was previously set to "down").
try:
module_index = self._chassis.get_module_index(dpu_name)
if module_index >= 0:
module = self._chassis.get_module(module_index)
if module is not None:
oper_status = module.get_oper_status()
if oper_status in (ModuleBase.MODULE_STATUS_OFFLINE,
ModuleBase.MODULE_STATUS_POWERED_DOWN):
logger.log_notice(
f"{dpu_name}: DPU is already in '{oper_status}' state, "
"skipping gNOI shutdown sequence"
)
cleared = self._clear_halt_flag(dpu_name)
if not cleared:
logger.log_warning(
f"{dpu_name}: Failed to clear halt flag while skipping gNOI shutdown"
)
return cleared
Comment on lines +134 to +150
Copy link
Contributor

@hdwhdw hdwhdw Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider extracting these logic into smaller functions like this, it avoid nesting and make it easier to read:

def _should_skip_gnoi_shutdown(self, dpu_name: str) -> Optional[bool]:
    """
    Returns:
      True  -> DPU is known to be offline/powered down; skip gNOI shutdown.
      False -> DPU is known to be online (or some other state where we proceed).
      None  -> Cannot determine status; caller should proceed with gNOI shutdown.
    """
    module_index = self._chassis.get_module_index(dpu_name)
    if module_index < 0:
        return None

    module = self._chassis.get_module(module_index)
    if module is None:
        return None

    oper_status = module.get_oper_status()
    return oper_status in (
        ModuleBase.MODULE_STATUS_OFFLINE,
        ModuleBase.MODULE_STATUS_POWERED_DOWN,
    )

except Exception as e:
logger.log_warning(
f"{dpu_name}: Could not determine operational status ({e}), "
"proceeding with gNOI shutdown"
)

# Wait for platform PCI detach completion
if not self._wait_for_gnoi_halt_in_progress(dpu_name):
logger.log_warning(f"{dpu_name}: Timeout waiting for PCI detach, proceeding anyway")
Expand Down Expand Up @@ -228,12 +256,12 @@ def _clear_halt_flag(self, dpu_name: str) -> bool:
if module_index < 0:
logger.log_error(f"{dpu_name}: Unable to get module index from chassis")
return False

module = self._chassis.get_module(module_index)
if module is None:
logger.log_error(f"{dpu_name}: Module at index {module_index} not found in chassis")
return False

module.clear_module_gnoi_halt_in_progress()
logger.log_info(f"{dpu_name}: Successfully cleared halt_in_progress flag (module index: {module_index})")
return True
Expand Down
138 changes: 126 additions & 12 deletions tests/gnoi_shutdown_daemon_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'scripts')))

import gnoi_shutdown_daemon
from sonic_platform_base.module_base import ModuleBase

# Common fixtures
mock_message = {
Expand Down Expand Up @@ -200,6 +201,7 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execut

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_ONLINE
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

Expand All @@ -208,8 +210,6 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execut
result = handler._handle_transition("DPU0", "shutdown")

self.assertTrue(result)
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()
self.assertEqual(mock_execute_command.call_count, 2)

Expand Down Expand Up @@ -248,6 +248,7 @@ def test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_ONLINE
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

Expand All @@ -257,8 +258,6 @@ def test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo

# Should still succeed - code proceeds anyway after timeout warning
self.assertTrue(result)
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_get_dpu_ip_and_port(self):
Expand Down Expand Up @@ -296,20 +295,19 @@ def test_handle_transition_ip_failure(self, mock_get_gnmi_port, mock_get_dpu_ip,

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_ONLINE
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock _wait_for_gnoi_halt_in_progress to return immediately to prevent hanging
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)

result = handler._handle_transition("DPU0", "shutdown")

self.assertFalse(result)
# Verify that clear_module_gnoi_halt_in_progress was called
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

@patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1")
Expand Down Expand Up @@ -519,22 +517,138 @@ def test_handle_transition_config_exception(self, mock_get_port, mock_get_ip, mo

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_ONLINE
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock _wait_for_gnoi_halt_in_progress to return immediately to prevent hanging
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)

result = handler._handle_transition("DPU0", "shutdown")

self.assertFalse(result)
# Verify that clear_module_gnoi_halt_in_progress was called
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_handle_transition_dpu_already_offline(self):
"""Test that gNOI shutdown is skipped when DPU is already offline."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module with Offline oper_status
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_OFFLINE
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)
result = handler._handle_transition("DPU0", "shutdown")

# Should return True (success) without attempting gNOI reboot
self.assertTrue(result)
mock_module.get_oper_status.assert_called_once()
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_handle_transition_dpu_powered_down(self):
"""Test that gNOI shutdown is skipped when DPU is in PoweredDown state."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module with PoweredDown oper_status
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_POWERED_DOWN
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)
result = handler._handle_transition("DPU0", "shutdown")

# Should return True (success) without attempting gNOI reboot
self.assertTrue(result)
mock_module.get_oper_status.assert_called_once()
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_handle_transition_dpu_fault_proceeds(self):
"""Test that gNOI shutdown proceeds when DPU is in Fault state."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module with Fault oper_status — should NOT skip
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_FAULT
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock remaining methods to prevent actual gNOI calls
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)
handler._send_reboot_command = MagicMock(return_value=True)
handler._poll_reboot_status = MagicMock(return_value=True)
handler._clear_halt_flag = MagicMock(return_value=True)

with patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1"), \
patch('gnoi_shutdown_daemon.get_dpu_gnmi_port', return_value="8080"):
result = handler._handle_transition("DPU0", "shutdown")

# Should proceed with shutdown for Fault state
self.assertTrue(result)
handler._wait_for_gnoi_halt_in_progress.assert_called_once()
handler._send_reboot_command.assert_called_once()

def test_handle_transition_dpu_offline_clear_halt_failure(self):
"""Test that _clear_halt_flag failure is propagated when DPU is offline."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module with Offline oper_status
mock_module = MagicMock()
mock_module.get_oper_status.return_value = ModuleBase.MODULE_STATUS_OFFLINE
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)
# Make _clear_halt_flag fail
handler._clear_halt_flag = MagicMock(return_value=False)

result = handler._handle_transition("DPU0", "shutdown")

# Should return False since _clear_halt_flag failed
self.assertFalse(result)
handler._clear_halt_flag.assert_called_once_with("DPU0")

def test_handle_transition_oper_status_check_exception(self):
"""Test that gNOI shutdown proceeds when oper_status check raises exception."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module to raise exception on get_module_index
mock_chassis.get_module_index.side_effect = Exception("Platform error")

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock remaining methods to prevent actual gNOI calls
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)
handler._send_reboot_command = MagicMock(return_value=True)
handler._poll_reboot_status = MagicMock(return_value=True)
handler._clear_halt_flag = MagicMock(return_value=True)

with patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1"), \
patch('gnoi_shutdown_daemon.get_dpu_gnmi_port', return_value="8080"):
result = handler._handle_transition("DPU0", "shutdown")

# Should proceed with shutdown despite oper_status check failure
self.assertTrue(result)
handler._wait_for_gnoi_halt_in_progress.assert_called_once()
handler._send_reboot_command.assert_called_once()


if __name__ == '__main__':
unittest.main()
Loading