Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions scripts/gnoi_shutdown_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,28 @@ def _handle_transition(self, dpu_name: str, transition_type: str) -> bool:
"""
logger.log_notice(f"{dpu_name}: Starting gNOI shutdown sequence")

# Check if DPU is already powered off / offline before attempting gNOI shutdown.
# This avoids error logs when config reload or reboot is issued while DPUs are
# already in the down state (e.g. admin_status was previously set to "down").
try:
module_index = self._chassis.get_module_index(dpu_name)
if module_index >= 0:
module = self._chassis.get_module(module_index)
if module is not None:
oper_status = module.get_oper_status()
if oper_status != "Online":
logger.log_notice(
f"{dpu_name}: DPU is already in '{oper_status}' state, "
"skipping gNOI shutdown sequence"
)
self._clear_halt_flag(dpu_name)
return True
except Exception as e:
logger.log_warning(
f"{dpu_name}: Could not determine operational status ({e}), "
"proceeding with gNOI shutdown"
)

# Wait for platform PCI detach completion
if not self._wait_for_gnoi_halt_in_progress(dpu_name):
logger.log_warning(f"{dpu_name}: Timeout waiting for PCI detach, proceeding anyway")
Expand Down Expand Up @@ -228,12 +250,12 @@ def _clear_halt_flag(self, dpu_name: str) -> bool:
if module_index < 0:
logger.log_error(f"{dpu_name}: Unable to get module index from chassis")
return False

module = self._chassis.get_module(module_index)
if module is None:
logger.log_error(f"{dpu_name}: Module at index {module_index} not found in chassis")
return False

module.clear_module_gnoi_halt_in_progress()
logger.log_info(f"{dpu_name}: Successfully cleared halt_in_progress flag (module index: {module_index})")
return True
Expand Down
86 changes: 74 additions & 12 deletions tests/gnoi_shutdown_daemon_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execut

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = "Online"
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

Expand All @@ -208,8 +209,6 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execut
result = handler._handle_transition("DPU0", "shutdown")

self.assertTrue(result)
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()
self.assertEqual(mock_execute_command.call_count, 2)

Expand Down Expand Up @@ -248,6 +247,7 @@ def test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = "Online"
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

Expand All @@ -257,8 +257,6 @@ def test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo

# Should still succeed - code proceeds anyway after timeout warning
self.assertTrue(result)
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_get_dpu_ip_and_port(self):
Expand Down Expand Up @@ -296,20 +294,19 @@ def test_handle_transition_ip_failure(self, mock_get_gnmi_port, mock_get_dpu_ip,

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = "Online"
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock _wait_for_gnoi_halt_in_progress to return immediately to prevent hanging
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)

result = handler._handle_transition("DPU0", "shutdown")

self.assertFalse(result)
# Verify that clear_module_gnoi_halt_in_progress was called
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

@patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1")
Expand Down Expand Up @@ -519,22 +516,87 @@ def test_handle_transition_config_exception(self, mock_get_port, mock_get_ip, mo

# Mock module for clear operation
mock_module = MagicMock()
mock_module.get_oper_status.return_value = "Online"
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock _wait_for_gnoi_halt_in_progress to return immediately to prevent hanging
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)

result = handler._handle_transition("DPU0", "shutdown")

self.assertFalse(result)
# Verify that clear_module_gnoi_halt_in_progress was called
mock_chassis.get_module_index.assert_called_with("DPU0")
mock_chassis.get_module.assert_called_with(0)
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_handle_transition_dpu_already_offline(self):
"""Test that gNOI shutdown is skipped when DPU is already offline."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module with Offline oper_status
mock_module = MagicMock()
mock_module.get_oper_status.return_value = "Offline"
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)
result = handler._handle_transition("DPU0", "shutdown")

# Should return True (success) without attempting gNOI reboot
self.assertTrue(result)
mock_module.get_oper_status.assert_called_once()
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_handle_transition_dpu_powered_down(self):
"""Test that gNOI shutdown is skipped when DPU is in PoweredDown state."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module with PoweredDown oper_status
mock_module = MagicMock()
mock_module.get_oper_status.return_value = "PoweredDown"
mock_chassis.get_module_index.return_value = 0
mock_chassis.get_module.return_value = mock_module

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)
result = handler._handle_transition("DPU0", "shutdown")

# Should return True (success) without attempting gNOI reboot
self.assertTrue(result)
mock_module.get_oper_status.assert_called_once()
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once()

def test_handle_transition_oper_status_check_exception(self):
"""Test that gNOI shutdown proceeds when oper_status check raises exception."""
mock_db = MagicMock()
mock_config_db = MagicMock()
mock_chassis = MagicMock()

# Mock module to raise exception on get_oper_status
mock_chassis.get_module_index.side_effect = Exception("Platform error")

handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis)

# Mock remaining methods to prevent actual gNOI calls
handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True)
handler._send_reboot_command = MagicMock(return_value=True)
handler._poll_reboot_status = MagicMock(return_value=True)
handler._clear_halt_flag = MagicMock(return_value=True)

with patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1"), \
patch('gnoi_shutdown_daemon.get_dpu_gnmi_port', return_value="8080"):
result = handler._handle_transition("DPU0", "shutdown")

# Should proceed with shutdown despite oper_status check failure
self.assertTrue(result)
handler._wait_for_gnoi_halt_in_progress.assert_called_once()
handler._send_reboot_command.assert_called_once()


if __name__ == '__main__':
unittest.main()
Loading