diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py index 48ad612590e..08c0ebd646a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py @@ -19,6 +19,8 @@ from . import utils from sonic_py_common import logger +import atexit +import functools import sys import time @@ -47,6 +49,23 @@ TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json' logger = logger.Logger('thermal-updater') +# Register a clean-up routine that will run when the process exits +def clean_thermal_data(sfp_list): + if not sfp_list: + return + hw_management_independent_mode_update.module_data_set_module_counter(len(sfp_list)) + for sfp in sfp_list: + try: + sw_control = sfp.is_sw_control() + if not sw_control: + continue + + hw_management_independent_mode_update.thermal_data_clean_module( + 0, + sfp.sdk_index + 1 + ) + except Exception as e: + logger.log_warning(f'Cleanup skipped for module {sfp.sdk_index + 1}: {e}') class ThermalUpdater: def __init__(self, sfp_list, update_asic=True): @@ -55,6 +74,8 @@ def __init__(self, sfp_list, update_asic=True): self._timer = utils.Timer() self._update_asic = update_asic + atexit.register(functools.partial(clean_thermal_data, self._sfp_list)) + def load_tc_config(self): asic_poll_interval = 1 sfp_poll_interval = 10 @@ -83,7 +104,6 @@ def load_tc_config(self): self._timer.schedule(sfp_poll_interval, self.update_module) def start(self): - self.clean_thermal_data() self.control_tc(False) self.load_tc_config() self._timer.start() @@ -96,15 +116,6 @@ def control_tc(self, suspend): logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}') utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0) - def clean_thermal_data(self): - hw_management_independent_mode_update.module_data_set_module_counter(len(self._sfp_list)) - hw_management_independent_mode_update.thermal_data_clean_asic(0) - for sfp in self._sfp_list: - hw_management_independent_mode_update.thermal_data_clean_module( - 0, - sfp.sdk_index + 1 - ) - def get_asic_temp(self): temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None) return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None diff --git a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py index ce87afa6ddd..bbb7d8b24a6 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_smartswsitch_thermal_updater.py @@ -1,6 +1,6 @@ # # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -68,8 +68,6 @@ def test_configuration(self, mock_write): hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id()) - hw_management_independent_mode_update.thermal_data_clean_asic.assert_called_once() - hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once() mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0) assert updater._timer.schedule.call_count == 3 # Called for DPU with time 24/2 = 12 diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py index 2b77f5db1ec..05ff8d7e53f 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py @@ -20,7 +20,7 @@ from unittest import mock from sonic_platform import utils -from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update +from sonic_platform.thermal_updater import ThermalUpdater, clean_thermal_data, hw_management_independent_mode_update from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \ ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD @@ -78,6 +78,7 @@ def test_start_stop(self, mock_write): @mock.patch('sonic_platform.utils.read_int_from_file') def test_update_asic(self, mock_read): + hw_management_independent_mode_update.reset_mock() mock_read.return_value = 8 updater = ThermalUpdater(None) assert updater.get_asic_temp() == 1000 @@ -92,6 +93,7 @@ def test_update_asic(self, mock_read): assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD def test_update_module(self): + hw_management_independent_mode_update.reset_mock() mock_sfp = mock.MagicMock() mock_sfp.sdk_index = 10 mock_sfp.get_presence = mock.MagicMock(return_value=True) @@ -109,3 +111,36 @@ def test_update_module(self): hw_management_independent_mode_update.reset_mock() updater.update_module() hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0) + + @mock.patch('sonic_platform.thermal_updater.clean_thermal_data') + @mock.patch('sonic_platform.thermal_updater.atexit.register') + def test_registers_exit_cleanup(self, mock_register, mock_clean): + hw_management_independent_mode_update.reset_mock() + sfp = mock.MagicMock() + updater = ThermalUpdater([sfp]) + + mock_register.assert_called_once() + exit_callback = mock_register.call_args[0][0] + + # Ensure clean routine is not run during construction/start + mock_clean.assert_not_called() + + # Simulate process exit and confirm cleanup uses the bound SFP list + exit_callback() + mock_clean.assert_called_once_with([sfp]) + + def test_clean_thermal_data_only_sw_control_modules(self): + hw_management_independent_mode_update.reset_mock() + + sfp_sw = mock.MagicMock() + sfp_sw.sdk_index = 3 + sfp_sw.is_sw_control = mock.MagicMock(return_value=True) + + sfp_no_sw = mock.MagicMock() + sfp_no_sw.sdk_index = 4 + sfp_no_sw.is_sw_control = mock.MagicMock(return_value=False) + + clean_thermal_data([sfp_sw, sfp_no_sw]) + + hw_management_independent_mode_update.module_data_set_module_counter.assert_called_once_with(2) + hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once_with(0, sfp_sw.sdk_index + 1)