Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Thermal control daemon for SONiC
"""

from enum import Enum, auto
import signal
import sys
import threading
Expand Down Expand Up @@ -61,18 +62,23 @@ def update_entity_info(table, parent_name, key, device, device_index):
table.set(key, fvs)


class FanType(Enum):
DRAWER = auto()
PSU = auto()
MODULE = auto()

class FanStatus(logger.Logger):
absent_fan_count = 0
faulty_fan_count = 0

def __init__(self, fan=None, is_psu_fan=False):
def __init__(self, fan=None, fan_type=FanType.DRAWER):
"""
Initializer of FanStatus
"""
super(FanStatus, self).__init__(SYSLOG_IDENTIFIER)

self.fan = fan
self.is_psu_fan = is_psu_fan
self.fan_type = fan_type
self.presence = True
self.status = True
self.under_speed = False
Expand All @@ -95,7 +101,7 @@ class FanStatus(logger.Logger):
:param presence: Fan presence status
:return: True if status changed else False
"""
if not presence and not self.is_psu_fan:
if not presence and self.fan_type == FanType.DRAWER:
FanStatus.absent_fan_count += 1

if presence == self.presence:
Expand Down Expand Up @@ -228,16 +234,25 @@ class FanUpdater(logger.Logger):
if self.task_stopping_event.is_set():
return
try:
self._refresh_fan_status(drawer, drawer_index, fan, fan_index)
self._refresh_fan_status(drawer, drawer_index, fan, fan_index, FanType.DRAWER)
except Exception as e:
self.log_warning('Failed to update fan status - {}'.format(repr(e)))

for module_index, module in enumerate(self.chassis.get_all_modules()):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thermals first check is_chassis_system before looping over module details, should that be done here as well? Or it's perhaps an unnecessary check there? https://github.com/sonic-net/sonic-platform-daemons/blob/master/sonic-thermalctld/scripts/thermalctld#L599 That also looks at PSUs connected to the modules, which may themselves have fans.

The PR title describes this as only for non-CPU modules, but it looks like that is not actually a limitation; won't this get all fans in any module where they are configured? Not necessarily a bad thing, but worth noting.

for fan_index, fan in enumerate(module.get_all_fans()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_fan_status(module, module_index, fan, fan_index, FanType.MODULE)
except Exception as e:
self.log_warning('Failed to update module fan status - {}'.format(repr(e)))

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
for fan_index, fan in enumerate(psu.get_all_fans()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_fan_status(psu, psu_index, fan, fan_index, True)
self._refresh_fan_status(psu, psu_index, fan, fan_index, FanType.PSU)
except Exception as e:
self.log_warning('Failed to update PSU fan status - {}'.format(repr(e)))

Expand Down Expand Up @@ -270,7 +285,7 @@ class FanUpdater(logger.Logger):

self.drawer_table.set(drawer_name, fvs)

def _refresh_fan_status(self, parent, parent_index, fan, fan_index, is_psu_fan=False):
def _refresh_fan_status(self, parent, parent_index, fan, fan_index, fan_type=FanType.DRAWER):
"""
Get Fan status by platform API and write to database for a given Fan
:param parent: Parent device of this fan
Expand All @@ -280,15 +295,17 @@ class FanUpdater(logger.Logger):
:param name_prefix: name prefix of Fan object if Fan.get_name not presented
:return:
"""
drawer_name = NOT_AVAILABLE if is_psu_fan else str(try_get(parent.get_name))
if is_psu_fan:
drawer_name = NOT_AVAILABLE if fan_type != FanType.DRAWER else str(try_get(parent.get_name))
if fan_type == FanType.PSU:
parent_name = 'PSU {}'.format(parent_index + 1)
elif fan_type == FanType.MODULE:
parent_name = 'Module {}'.format(parent_index + 1)
else:
parent_name = drawer_name if drawer_name != NOT_AVAILABLE else CHASSIS_INFO_KEY
fan_name = try_get(fan.get_name, '{} fan {}'.format(parent_name, fan_index + 1))
update_entity_info(self.phy_entity_table, parent_name, fan_name, fan, fan_index + 1)
if fan_name not in self.fan_status_dict:
self.fan_status_dict[fan_name] = FanStatus(fan, is_psu_fan)
self.fan_status_dict[fan_name] = FanStatus(fan, fan_type)

fan_status = self.fan_status_dict[fan_name]

Expand Down Expand Up @@ -342,7 +359,7 @@ class FanUpdater(logger.Logger):

# We don't set PSU led here, PSU led will be handled in psud
if set_led:
if not is_psu_fan:
if fan_type == FanType.DRAWER:
self._set_fan_led(parent, fan, fan_name, fan_status)

if fan_fault_status != NOT_AVAILABLE:
Expand Down
2 changes: 2 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,10 @@ def make_module_thermal(self):
sfp._thermal_list.append(MockThermal())
psu = MockPsu()
psu._thermal_list.append(MockThermal())
fan = MockFan()
module._sfp_list.append(sfp)
module._psu_list.append(psu)
module._fan_list.append(fan)
module._thermal_list.append(MockThermal())

def is_modular_chassis(self):
Expand Down
22 changes: 21 additions & 1 deletion sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from sonic_py_common import daemon_base

from .mock_platform import MockChassis, MockFan, MockPsu, MockSfp, MockThermal
from .mock_platform import MockChassis, MockFan, MockModule, MockPsu, MockSfp, MockThermal
from .mock_swsscommon import Table

daemon_base.db_connect = mock.MagicMock()
Expand Down Expand Up @@ -267,6 +267,26 @@ def test_update_psu_fans(self):
else:
fan_updater.log_warning.assert_called_with("Failed to update PSU fan status - Exception('Test message',)")

def test_update_module_fans(self):
chassis = MockChassis()
module = MockModule()
mock_fan = MockFan()
chassis.set_modular_chassis(True)
module._fan_list.append(mock_fan)
chassis._module_list.append(module)
fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event())
fan_updater.update()
assert fan_updater.log_warning.call_count == 0

fan_updater._refresh_fan_status = mock.MagicMock(side_effect=Exception("Test message"))
fan_updater.update()
assert fan_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
fan_updater.log_warning.assert_called_with("Failed to update module fan status - Exception('Test message')")
else:
fan_updater.log_warning.assert_called_with("Failed to update module fan status - Exception('Test message',)")

class TestThermalMonitor(object):
"""
Expand Down