diff --git a/src/system-health/health_checker/config.py b/src/system-health/health_checker/config.py index 63af22db4ad..628069c6217 100644 --- a/src/system-health/health_checker/config.py +++ b/src/system-health/health_checker/config.py @@ -44,7 +44,7 @@ def __init__(self): self.ignore_services = None self.ignore_devices = None self.user_defined_checkers = None - + self.include_devices = None def config_file_exists(self): return os.path.exists(self._config_file) @@ -72,6 +72,7 @@ def load_config(self): self.ignore_services = self._get_list_data('services_to_ignore') self.ignore_devices = self._get_list_data('devices_to_ignore') self.user_defined_checkers = self._get_list_data('user_defined_checkers') + self.include_devices = self._get_list_data('include_devices') except Exception as e: self._reset() @@ -86,6 +87,7 @@ def _reset(self): self.ignore_services = None self.ignore_devices = None self.user_defined_checkers = None + self.include_devices = None def get_led_color(self, status): """ diff --git a/src/system-health/health_checker/hardware_checker.py b/src/system-health/health_checker/hardware_checker.py index 113fd88663a..1bde5a36bf1 100644 --- a/src/system-health/health_checker/hardware_checker.py +++ b/src/system-health/health_checker/hardware_checker.py @@ -1,8 +1,11 @@ from natsort import natsorted +from swsscommon import swsscommon from swsscommon.swsscommon import SonicV2Connector from .health_checker import HealthChecker +EVENTS_PUBLISHER_SOURCE = "sonic-events-host" +EVENTS_PUBLISHER_TAG = "liquid-cooling-leak" class HardwareChecker(HealthChecker): """ @@ -12,12 +15,15 @@ class HardwareChecker(HealthChecker): ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC' FAN_TABLE_NAME = 'FAN_INFO' PSU_TABLE_NAME = 'PSU_INFO' + LIQUID_COOLING_TABLE_NAME = 'LIQUID_COOLING_INFO' def __init__(self): HealthChecker.__init__(self) self._db = SonicV2Connector(use_unix_socket_path=True) self._db.connect(self._db.STATE_DB) + self.leaking_sensors = [] + def get_category(self): return 'Hardware' @@ -26,6 +32,7 @@ def check(self, config): self._check_asic_status(config) self._check_fan_status(config) self._check_psu_status(config) + self._check_liquid_cooling_status(config) def _check_asic_status(self, config): """ @@ -272,6 +279,7 @@ def _check_psu_status(self, config): def reset(self): self._info = {} + self.leaking_sensors = [] @classmethod def _ignore_check(cls, ignore_set, category, object_name, check_point): @@ -283,3 +291,60 @@ def _ignore_check(cls, ignore_set, category, object_name, check_point): elif '{}.{}'.format(object_name, check_point) in ignore_set: return True return False + + def publish_events(self, sensors, event_name): + if not sensors: + return + params = swsscommon.FieldValueMap() + events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) + for sensor in sensors: + params[event_name] = sensor + swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params) + swsscommon.events_deinit_publisher(events_handle) + + + def _check_liquid_cooling_status(self, config): + """ + Check liquid cooling status including: + 1. Check all leakage sensors are in good state + :param config: Health checker configuration + :return: + """ + if not config.include_devices or 'liquid_cooling' not in config.include_devices: + return + + keys = self._db.keys(self._db.STATE_DB, HardwareChecker.LIQUID_COOLING_TABLE_NAME + '|*') + if not keys: + self.set_object_not_ok('Liquid Cooling', 'Liquid Cooling', 'Failed to get liquid cooling information') + return + + new_leaking_sensors = [] + for key in natsorted(keys): + key_list = key.split('|') + if len(key_list) != 2: # error data in DB, log it and ignore + self.set_object_not_ok('Liquid Cooling', key, 'Invalid key for LIQUID_COOLING_INFO: {}'.format(key)) + continue + + name = key_list[1] + if config.ignore_devices and name in config.ignore_devices: + continue + + data_dict = self._db.get_all(self._db.STATE_DB, key) + leak_status = data_dict.get('leak_status', None) + if leak_status is None or leak_status == 'N/A': + self.set_object_not_ok('Liquid Cooling', name, 'Failed to get leakage sensor status for {}'.format(name)) + continue + + if leak_status.lower() == 'yes' and name not in self.leaking_sensors: + self.leaking_sensors.append(name) + new_leaking_sensors.append(name) + self.set_object_not_ok('Liquid Cooling', name, 'Leakage sensor {} is leaking'.format(name)) + continue + + if leak_status.lower() == 'no': + self.set_object_ok('Liquid Cooling', name) + if name in self.leaking_sensors: + self.leaking_sensors.remove(name) + self.publish_events([name], "leaking sensor report recovered") + + self.publish_events(new_leaking_sensors, "sensor report leaking event") diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index 73e5cf3c1b4..1101eb11302 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -70,8 +70,6 @@ def __init__(self): self.load_critical_process_cache() - self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) - def get_expected_running_containers(self, feature_table): """Get a set of containers that are expected to running on SONiC @@ -342,7 +340,6 @@ def check(self, config): self.reset() self.check_by_monit(config) self.check_services(config) - swsscommon.events_deinit_publisher(self.events_handle) def _parse_supervisorctl_status(self, process_status): """Expected input: @@ -366,9 +363,11 @@ def _parse_supervisorctl_status(self, process_status): def publish_events(self, container_name, critical_process_list): params = swsscommon.FieldValueMap() params["ctr_name"] = container_name + events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) for process_name in critical_process_list: params["process_name"] = process_name - swsscommon.event_publish(self.events_handle, EVENTS_PUBLISHER_TAG, params) + swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params) + swsscommon.events_deinit_publisher(events_handle) def check_process_existence(self, container_name, critical_process_list, config, feature_table): """Check whether the process in the specified container is running or not. diff --git a/src/system-health/health_checker/system_health_monitoring_config.json b/src/system-health/health_checker/system_health_monitoring_config.json index 654d43d8109..12f16c850a2 100644 --- a/src/system-health/health_checker/system_health_monitoring_config.json +++ b/src/system-health/health_checker/system_health_monitoring_config.json @@ -2,6 +2,7 @@ "services_to_ignore": [], "devices_to_ignore": [], "user_defined_checkers": [], + "include_devices": [], "polling_interval": 60, "led_color": { "fault": "amber", diff --git a/src/system-health/health_checker/utils.py b/src/system-health/health_checker/utils.py index 00e7754e1ec..1a195c7c4c3 100644 --- a/src/system-health/health_checker/utils.py +++ b/src/system-health/health_checker/utils.py @@ -1,6 +1,5 @@ import subprocess - def run_command(command): """ Utility function to run an shell command and return the output. diff --git a/src/system-health/tests/system_health_monitoring_config.json b/src/system-health/tests/system_health_monitoring_config.json index 7284be7d163..487138cbf5a 100644 --- a/src/system-health/tests/system_health_monitoring_config.json +++ b/src/system-health/tests/system_health_monitoring_config.json @@ -2,6 +2,7 @@ "services_to_ignore": ["dummy_service"], "devices_to_ignore": ["psu.voltage"], "user_defined_checkers": [], + "include_devices": ["liquid_cooling"], "polling_interval": 60, "led_color": { "fault": "orange", diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 17997184490..620a6ca38a4 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -471,9 +471,37 @@ def test_hardware_checker(): } }) + MockConnector.data.update({ + 'LIQUID_COOLING_INFO|liquid_cooling_1': { + 'leak_status': 'Yes', + 'leak_sensor_name': 'liquid_cooling_1' + }, + 'LIQUID_COOLING_INFO|liquid_cooling_2': { + 'leak_status': 'No', + 'leak_sensor_name': 'liquid_cooling_2' + }, + 'LIQUID_COOLING_INFO|liquid_cooling_3': { + 'leak_status': 'Yes', + 'leak_sensor_name': 'liquid_cooling_3' + }, + 'LIQUID_COOLING_INFO|liquid_cooling_4': { + 'leak_status': 'No', + 'leak_sensor_name': 'liquid_cooling_4' + }, + 'LIQUID_COOLING_INFO|liquid_cooling_5': { + 'leak_status': 'Yes', + 'leak_sensor_name': 'liquid_cooling_5' + }, + 'LIQUID_COOLING_INFO|liquid_cooling_6': { + 'leak_status': 'No', + 'leak_sensor_name': 'liquid_cooling_6' + } + }) + checker = HardwareChecker() assert checker.get_category() == 'Hardware' config = Config() + config.include_devices = ['liquid_cooling'] checker.check(config) assert 'ASIC' in checker._info @@ -521,6 +549,24 @@ def test_hardware_checker(): assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'System power exceeds threshold but power_critical_threshold is invalid' + assert 'liquid_cooling_1' in checker._info + assert checker._info['liquid_cooling_1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'liquid_cooling_2' in checker._info + assert checker._info['liquid_cooling_2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'liquid_cooling_3' in checker._info + assert checker._info['liquid_cooling_3'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'liquid_cooling_4' in checker._info + assert checker._info['liquid_cooling_4'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + + assert 'liquid_cooling_5' in checker._info + assert checker._info['liquid_cooling_5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK + + assert 'liquid_cooling_6' in checker._info + assert checker._info['liquid_cooling_6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK + def test_config(): config = Config() @@ -532,6 +578,7 @@ def test_config(): assert 'dummy_service' in config.ignore_services assert 'psu.voltage' in config.ignore_devices assert len(config.user_defined_checkers) == 0 + assert 'liquid_cooling' in config.include_devices assert config.get_led_color('fault') == 'orange' assert config.get_led_color('normal') == 'green' @@ -543,6 +590,7 @@ def test_config(): assert not config.ignore_devices assert not config.user_defined_checkers assert not config.config_data + assert not config.include_devices assert config.get_led_color('fault') == 'red' assert config.get_led_color('normal') == 'green'