diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 1e26b49f1..fa5da2654 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -41,6 +41,8 @@ def try_get(callback, default=NOT_AVAILABLE): """ try: ret = callback() + if ret is None: + ret = default except NotImplementedError: ret = default @@ -174,16 +176,19 @@ class FanUpdater(object): :return: """ logger.log_debug("Start fan updating") - try: - for index, fan in enumerate(self.chassis.get_all_fans()): + for index, fan in enumerate(self.chassis.get_all_fans()): + try: self._refresh_fan_status(fan, index) + except Exception as e: + logger.log_warning('Failed to update FAN status - {}'.format(e)) - for psu_index, psu in enumerate(self.chassis.get_all_psus()): - psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index)) - for fan_index, fan in enumerate(psu.get_all_fans()): + for psu_index, psu in enumerate(self.chassis.get_all_psus()): + psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index)) + for fan_index, fan in enumerate(psu.get_all_fans()): + try: self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name)) - except Exception as e: - logger.log_warning('Failed to update FAN status - {}'.format(e)) + except Exception as e: + logger.log_warning('Failed to update PSU FAN status - {}'.format(e)) logger.log_debug("End fan updating") @@ -201,10 +206,18 @@ class FanUpdater(object): fan_status = self.fan_status_dict[fan_name] + speed = NOT_AVAILABLE + speed_tolerance = NOT_AVAILABLE + speed_target = NOT_AVAILABLE + fan_fault_status = NOT_AVAILABLE + fan_direction = NOT_AVAILABLE presence = try_get(fan.get_presence, False) - speed = try_get(fan.get_speed) - speed_tolerance = try_get(fan.get_speed_tolerance) - speed_target = try_get(fan.get_target_speed) + if presence: + speed = try_get(fan.get_speed) + speed_tolerance = try_get(fan.get_speed_tolerance) + speed_target = try_get(fan.get_target_speed) + fan_fault_status = try_get(fan.get_status, False) + fan_direction = try_get(fan.get_direction) set_led = False if fan_status.set_presence(presence): @@ -215,7 +228,7 @@ class FanUpdater(object): 'the system, potential overheat hazard'.format(fan_name) ) - if fan_status.set_under_speed(speed, speed_target, speed_tolerance): + if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance): set_led = True log_on_status_changed(not fan_status.under_speed, 'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name), @@ -223,7 +236,7 @@ class FanUpdater(object): format(fan_name, speed, speed_target, speed_tolerance) ) - if fan_status.set_over_speed(speed, speed_target, speed_tolerance): + if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance): set_led = True log_on_status_changed(not fan_status.over_speed, 'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name), @@ -240,8 +253,8 @@ class FanUpdater(object): [('presence', str(presence)), ('model', str(try_get(fan.get_model))), ('serial', str(try_get(fan.get_serial))), - ('status', str(try_get(fan.get_status, False))), - ('direction', str(try_get(fan.get_direction))), + ('status', str(fan_fault_status)), + ('direction', str(fan_direction)), ('speed', str(speed)), ('speed_tolerance', str(speed_tolerance)), ('speed_target', str(speed_target)), @@ -378,12 +391,12 @@ class TemperatureUpdater(object): Update all temperature information to database :return: """ - logger.log_debug("Start temperature updating") - try: - for index, thermal in enumerate(self.chassis.get_all_thermals()): + logger.log_debug("Start temperature updating") + for index, thermal in enumerate(self.chassis.get_all_thermals()): + try: self._refresh_temperature_status(thermal, index) - except Exception as e: - logger.log_warning('Failed to update thermal status - {}'.format(e)) + except Exception as e: + logger.log_warning('Failed to update thermal status - {}'.format(e)) logger.log_debug("End temperature updating") @@ -400,26 +413,33 @@ class TemperatureUpdater(object): temperature_status = self.temperature_status_dict[name] + high_threshold = NOT_AVAILABLE + low_threshold = NOT_AVAILABLE + high_critical_threshold = NOT_AVAILABLE + low_critical_threshold = NOT_AVAILABLE temperature = try_get(thermal.get_temperature) - temperature_status.set_temperature(name, temperature) - high_threshold = try_get(thermal.get_high_threshold) - low_threshold = try_get(thermal.get_low_threshold) - + if temperature != NOT_AVAILABLE: + temperature_status.set_temperature(name, temperature) + high_threshold = try_get(thermal.get_high_threshold) + low_threshold = try_get(thermal.get_low_threshold) + high_critical_threshold = try_get(thermal.get_high_critical_threshold) + low_critical_threshold = try_get(thermal.get_low_critical_threshold) + warning = False - if temperature_status.set_over_temperature(temperature, high_threshold): + if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold): log_on_status_changed(not temperature_status.over_temperature, - 'High temperature warning: {} current temperature {}C, high threshold {}C'. - format(name, temperature, high_threshold), 'High temperature warning cleared: {} temperature restore to {}C, high threshold {}C.'. + format(name, temperature, high_threshold), + 'High temperature warning: {} current temperature {}C, high threshold {}C'. format(name, temperature, high_threshold) ) warning = warning | temperature_status.over_temperature - if temperature_status.set_under_temperature(temperature, low_threshold): + if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold): log_on_status_changed(not temperature_status.under_temperature, - 'Low temperature warning: {} current temperature {}C, low threshold {}C'. - format(name, temperature, low_threshold), 'Low temperature warning cleared: {} temperature restore to {}C, low threshold {}C.'. + format(name, temperature, low_threshold), + 'Low temperature warning: {} current temperature {}C, low threshold {}C'. format(name, temperature, low_threshold) ) warning = warning | temperature_status.under_temperature @@ -429,8 +449,8 @@ class TemperatureUpdater(object): ('high_threshold', str(high_threshold)), ('low_threshold', str(low_threshold)), ('warning_status', str(warning)), - ('critical_high_threshold', str(try_get(thermal.get_high_critical_threshold))), - ('critical_low_threshold', str(try_get(thermal.get_low_critical_threshold))), + ('critical_high_threshold', str(high_critical_threshold)), + ('critical_low_threshold', str(low_critical_threshold)), ('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S')) ]) diff --git a/sonic-thermalctld/tests/mock_platform.py b/sonic-thermalctld/tests/mock_platform.py index ddbaf404a..df2c65181 100644 --- a/sonic-thermalctld/tests/mock_platform.py +++ b/sonic-thermalctld/tests/mock_platform.py @@ -68,6 +68,11 @@ def make_normal_speed(self): self.speed_tolerance = 0 +class MockErrorFan(MockFan): + def get_speed(self): + raise Exception('Fail to get speed') + + class MockPsu(MockDevice): def __init__(self): self.fan_list = [] @@ -118,6 +123,11 @@ def make_normal_temper(self): self.temperature = 2 self.low_threshold = 1 + +class MockErrorThermal(MockThermal): + def get_temperature(self): + raise Exception('Fail to get temperature') + class MockChassis: def __init__(self): @@ -149,6 +159,10 @@ def make_over_speed_fan(self): fan.make_over_speed() self.fan_list.append(fan) + def make_error_fan(self): + fan = MockErrorFan() + self.fan_list.append(fan) + def make_over_temper_thermal(self): thermal = MockThermal() thermal.make_over_temper() @@ -159,3 +173,7 @@ def make_under_temper_thermal(self): thermal.make_under_temper() self.thermal_list.append(thermal) + def make_error_thermal(self): + thermal = MockErrorThermal() + self.thermal_list.append(thermal) + diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index ac6bd5106..f2f0b05bd 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -2,7 +2,7 @@ import sys from mock import Mock, MagicMock, patch from sonic_daemon_base import daemon_base -from .mock_platform import MockChassis, MockFan +from .mock_platform import MockChassis, MockFan, MockThermal daemon_base.db_connect = MagicMock() @@ -198,3 +198,27 @@ def test_temperupdater_under_temper(): temperature_updater.update() logger.log_notice.assert_called_once() + +def test_update_fan_with_exception(): + chassis = MockChassis() + chassis.make_error_fan() + fan = MockFan() + fan.make_over_speed() + chassis.get_all_fans().append(fan) + + fan_updater = FanUpdater(chassis) + fan_updater.update() + assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED + logger.log_warning.assert_called() + + +def test_update_thermal_with_exception(): + chassis = MockChassis() + chassis.make_error_thermal() + thermal = MockThermal() + thermal.make_over_temper() + chassis.get_all_thermals().append(thermal) + + temperature_updater = TemperatureUpdater(chassis) + temperature_updater.update() + logger.log_warning.assert_called()