Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 51 additions & 31 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def try_get(callback, default=NOT_AVAILABLE):
"""
try:
ret = callback()
if ret is None:
ret = default
except NotImplementedError:
ret = default

Expand Down Expand Up @@ -174,16 +176,19 @@ class FanUpdater(object):
:return:
"""
logger.log_debug("Start fan updating")
try:
for index, fan in enumerate(self.chassis.get_all_fans()):
for index, fan in enumerate(self.chassis.get_all_fans()):
try:
self._refresh_fan_status(fan, index)
except Exception as e:
logger.log_warning('Failed to update FAN status - {}'.format(e))

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
for fan_index, fan in enumerate(psu.get_all_fans()):
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
for fan_index, fan in enumerate(psu.get_all_fans()):
try:
self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name))
except Exception as e:
logger.log_warning('Failed to update FAN status - {}'.format(e))
except Exception as e:
logger.log_warning('Failed to update PSU FAN status - {}'.format(e))

logger.log_debug("End fan updating")

Expand All @@ -201,10 +206,18 @@ class FanUpdater(object):

fan_status = self.fan_status_dict[fan_name]

speed = NOT_AVAILABLE
speed_tolerance = NOT_AVAILABLE
speed_target = NOT_AVAILABLE
fan_fault_status = NOT_AVAILABLE
fan_direction = NOT_AVAILABLE
presence = try_get(fan.get_presence, False)
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)
speed_target = try_get(fan.get_target_speed)
if presence:
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)
speed_target = try_get(fan.get_target_speed)
fan_fault_status = try_get(fan.get_status, False)
fan_direction = try_get(fan.get_direction)

set_led = False
if fan_status.set_presence(presence):
Expand All @@ -215,15 +228,15 @@ class FanUpdater(object):
'the system, potential overheat hazard'.format(fan_name)
)

if fan_status.set_under_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
set_led = True
log_on_status_changed(not fan_status.under_speed,
'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name),
'Fan under speed warning: {} current speed={}, target speed={}, tolerance={}.'.
format(fan_name, speed, speed_target, speed_tolerance)
)

if fan_status.set_over_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
set_led = True
log_on_status_changed(not fan_status.over_speed,
'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name),
Expand All @@ -240,8 +253,8 @@ class FanUpdater(object):
[('presence', str(presence)),
('model', str(try_get(fan.get_model))),
('serial', str(try_get(fan.get_serial))),
('status', str(try_get(fan.get_status, False))),
('direction', str(try_get(fan.get_direction))),
('status', str(fan_fault_status)),
('direction', str(fan_direction)),
('speed', str(speed)),
('speed_tolerance', str(speed_tolerance)),
('speed_target', str(speed_target)),
Expand Down Expand Up @@ -378,12 +391,12 @@ class TemperatureUpdater(object):
Update all temperature information to database
:return:
"""
logger.log_debug("Start temperature updating")
try:
for index, thermal in enumerate(self.chassis.get_all_thermals()):
logger.log_debug("Start temperature updating")
for index, thermal in enumerate(self.chassis.get_all_thermals()):
try:
self._refresh_temperature_status(thermal, index)
except Exception as e:
logger.log_warning('Failed to update thermal status - {}'.format(e))
except Exception as e:
logger.log_warning('Failed to update thermal status - {}'.format(e))

logger.log_debug("End temperature updating")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove debugging log, which shows up every 60s in syslog

DEBUG pmon#thermalctld: Start temperature updating
DEBUG pmon#thermalctld: End temperature updating

@Junchao-Mellanox

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, will remove it in another PR.


Expand All @@ -400,26 +413,33 @@ class TemperatureUpdater(object):

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
temperature_status.set_temperature(name, temperature)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)

if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature_status.set_over_temperature(temperature, high_threshold):
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning cleared: {} temperature restore to {}C, high threshold {}C.'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature_status.set_under_temperature(temperature, low_threshold):
if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning cleared: {} temperature restore to {}C, low threshold {}C.'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature
Expand All @@ -429,8 +449,8 @@ class TemperatureUpdater(object):
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(try_get(thermal.get_high_critical_threshold))),
('critical_low_threshold', str(try_get(thermal.get_low_critical_threshold))),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])

Expand Down
18 changes: 18 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def make_normal_speed(self):
self.speed_tolerance = 0


class MockErrorFan(MockFan):
def get_speed(self):
raise Exception('Fail to get speed')


class MockPsu(MockDevice):
def __init__(self):
self.fan_list = []
Expand Down Expand Up @@ -118,6 +123,11 @@ def make_normal_temper(self):
self.temperature = 2
self.low_threshold = 1


class MockErrorThermal(MockThermal):
def get_temperature(self):
raise Exception('Fail to get temperature')


class MockChassis:
def __init__(self):
Expand Down Expand Up @@ -149,6 +159,10 @@ def make_over_speed_fan(self):
fan.make_over_speed()
self.fan_list.append(fan)

def make_error_fan(self):
fan = MockErrorFan()
self.fan_list.append(fan)

def make_over_temper_thermal(self):
thermal = MockThermal()
thermal.make_over_temper()
Expand All @@ -159,3 +173,7 @@ def make_under_temper_thermal(self):
thermal.make_under_temper()
self.thermal_list.append(thermal)

def make_error_thermal(self):
thermal = MockErrorThermal()
self.thermal_list.append(thermal)

26 changes: 25 additions & 1 deletion sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
from mock import Mock, MagicMock, patch
from sonic_daemon_base import daemon_base
from .mock_platform import MockChassis, MockFan
from .mock_platform import MockChassis, MockFan, MockThermal

daemon_base.db_connect = MagicMock()

Expand Down Expand Up @@ -198,3 +198,27 @@ def test_temperupdater_under_temper():
temperature_updater.update()
logger.log_notice.assert_called_once()


def test_update_fan_with_exception():
chassis = MockChassis()
chassis.make_error_fan()
fan = MockFan()
fan.make_over_speed()
chassis.get_all_fans().append(fan)

fan_updater = FanUpdater(chassis)
fan_updater.update()
assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED
logger.log_warning.assert_called()


def test_update_thermal_with_exception():
chassis = MockChassis()
chassis.make_error_thermal()
thermal = MockThermal()
thermal.make_over_temper()
chassis.get_all_thermals().append(thermal)

temperature_updater = TemperatureUpdater(chassis)
temperature_updater.update()
logger.log_warning.assert_called()