Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from . import utils
from sonic_py_common import logger

import atexit
import functools
import sys
import time

Expand Down Expand Up @@ -47,6 +49,23 @@
TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json'
logger = logger.Logger('thermal-updater')

# Clean-up routine; ThermalUpdater.__init__ registers it with atexit so it runs when the process exits
def clean_thermal_data(sfp_list):
    """Clear hw-management thermal records for software-controlled modules.

    Does nothing when ``sfp_list`` is empty or ``None``. Otherwise the module
    counter is set to ``len(sfp_list)`` and, for every SFP whose
    ``is_sw_control()`` returns a truthy value,
    ``thermal_data_clean_module(0, sdk_index + 1)`` is called.

    A failure on one module is logged as a warning and does not stop the
    cleanup of the remaining modules.

    Args:
        sfp_list: iterable of SFP objects exposing ``sdk_index`` and
            ``is_sw_control()``, or a falsy value to skip the cleanup.
    """
    if not sfp_list:
        return
    hw_management_independent_mode_update.module_data_set_module_counter(len(sfp_list))
    for sfp in sfp_list:
        # Placeholder used in the warning when the module id itself cannot be
        # read; the handler must never raise, or the rest of the list would be
        # skipped and the exception would escape the exit callback.
        module_id = 'unknown'
        try:
            module_id = sfp.sdk_index + 1
            if not sfp.is_sw_control():
                continue

            hw_management_independent_mode_update.thermal_data_clean_module(
                0,
                module_id
            )
        except Exception as e:
            logger.log_warning(f'Cleanup skipped for module {module_id}: {e}')

class ThermalUpdater:
def __init__(self, sfp_list, update_asic=True):
Expand All @@ -55,6 +74,8 @@ def __init__(self, sfp_list, update_asic=True):
self._timer = utils.Timer()
self._update_asic = update_asic

atexit.register(functools.partial(clean_thermal_data, self._sfp_list))

def load_tc_config(self):
asic_poll_interval = 1
sfp_poll_interval = 10
Expand Down Expand Up @@ -83,7 +104,6 @@ def load_tc_config(self):
self._timer.schedule(sfp_poll_interval, self.update_module)

def start(self):
self.clean_thermal_data()
self.control_tc(False)
self.load_tc_config()
self._timer.start()
Expand All @@ -96,15 +116,6 @@ def control_tc(self, suspend):
logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}')
utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0)

def clean_thermal_data(self):
    """Reset the hw-management thermal records for the ASIC and all modules.

    Passes the length of ``self._sfp_list`` to
    ``module_data_set_module_counter``, calls ``thermal_data_clean_asic(0)``,
    then calls ``thermal_data_clean_module(0, sdk_index + 1)`` for each SFP
    in the list.
    """
    hw_mgmt = hw_management_independent_mode_update
    module_count = len(self._sfp_list)
    hw_mgmt.module_data_set_module_counter(module_count)
    hw_mgmt.thermal_data_clean_asic(0)
    for module in self._sfp_list:
        module_id = module.sdk_index + 1
        hw_mgmt.thermal_data_clean_module(0, module_id)

def get_asic_temp(self):
    """Return the scaled ASIC temperature, or ``None`` when it cannot be read.

    The raw integer is read from the sx_core sysfs node and multiplied by
    ``ASIC_TEMPERATURE_SCALE``. ``None`` is returned when the read yields
    ``None`` (the default passed to the reader).
    """
    raw_value = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
    if raw_value is None:
        return None
    return raw_value * ASIC_TEMPERATURE_SCALE
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -68,8 +68,6 @@ def test_configuration(self, mock_write):
hw_management_dpu_thermal_update.thermal_data_dpu_cpu_core_clear.assert_called_once_with(dpu.get_hw_mgmt_id())
hw_management_dpu_thermal_update.thermal_data_dpu_ddr_clear.assert_called_once_with(dpu.get_hw_mgmt_id())
hw_management_dpu_thermal_update.thermal_data_dpu_drive_clear.assert_called_once_with(dpu.get_hw_mgmt_id())
hw_management_independent_mode_update.thermal_data_clean_asic.assert_called_once()
hw_management_independent_mode_update.thermal_data_clean_module.assert_called_once()
mock_write.assert_called_once_with('/run/hw-management/config/suspend', 0)
assert updater._timer.schedule.call_count == 3
# Called for DPU with time 24/2 = 12
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from unittest import mock

from sonic_platform import utils
from sonic_platform.thermal_updater import ThermalUpdater, hw_management_independent_mode_update
from sonic_platform.thermal_updater import ThermalUpdater, clean_thermal_data, hw_management_independent_mode_update
from sonic_platform.thermal_updater import ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD, \
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

Expand Down Expand Up @@ -78,6 +78,7 @@ def test_start_stop(self, mock_write):

@mock.patch('sonic_platform.utils.read_int_from_file')
def test_update_asic(self, mock_read):
hw_management_independent_mode_update.reset_mock()
mock_read.return_value = 8
updater = ThermalUpdater(None)
assert updater.get_asic_temp() == 1000
Expand All @@ -92,6 +93,7 @@ def test_update_asic(self, mock_read):
assert updater.get_asic_temp_critical_threshold() == ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

def test_update_module(self):
hw_management_independent_mode_update.reset_mock()
mock_sfp = mock.MagicMock()
mock_sfp.sdk_index = 10
mock_sfp.get_presence = mock.MagicMock(return_value=True)
Expand All @@ -109,3 +111,36 @@ def test_update_module(self):
hw_management_independent_mode_update.reset_mock()
updater.update_module()
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0)

@mock.patch('sonic_platform.thermal_updater.clean_thermal_data')
@mock.patch('sonic_platform.thermal_updater.atexit.register')
def test_registers_exit_cleanup(self, mock_register, mock_clean):
    """Constructing a ThermalUpdater registers an atexit cleanup bound to its SFP list."""
    hw_management_independent_mode_update.reset_mock()
    module = mock.MagicMock()
    sfp_list = [module]
    updater = ThermalUpdater(sfp_list)

    # Exactly one exit hook must have been registered by the constructor.
    mock_register.assert_called_once()
    positional_args = mock_register.call_args[0]
    registered_hook = positional_args[0]

    # Construction alone must not trigger the clean routine.
    mock_clean.assert_not_called()

    # Invoking the hook simulates process exit; it must forward the SFP list.
    registered_hook()
    mock_clean.assert_called_once_with([module])

def test_clean_thermal_data_only_sw_control_modules(self):
    """Only modules reporting software control get their thermal data cleaned."""
    hw_mgmt = hw_management_independent_mode_update
    hw_mgmt.reset_mock()

    # One module under software control, one that is not.
    controlled = mock.MagicMock()
    controlled.sdk_index = 3
    controlled.is_sw_control.return_value = True

    not_controlled = mock.MagicMock()
    not_controlled.sdk_index = 4
    not_controlled.is_sw_control.return_value = False

    clean_thermal_data([controlled, not_controlled])

    # The counter covers both modules; only the controlled one is cleaned.
    hw_mgmt.module_data_set_module_counter.assert_called_once_with(2)
    expected_module_id = controlled.sdk_index + 1
    hw_mgmt.thermal_data_clean_module.assert_called_once_with(0, expected_module_id)