From c9df87942960e275ca2679d6efeb5761b0cf083d Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Mon, 15 Jun 2020 11:57:06 +0800 Subject: [PATCH 1/2] Regression test cases for system-health feature --- tests/system_health/__init__.py | 0 tests/system_health/device_mocker.py | 62 ++++ tests/system_health/files/device_check.json | 12 + tests/system_health/files/external_check.json | 12 + .../files/ignore_asic_check.json | 12 + .../files/ignore_device_check.json | 12 + .../system_health/files/ignore_fan_check.json | 12 + .../system_health/files/ignore_psu_check.json | 12 + .../files/mock_valid_external_checker.txt | 3 + tests/system_health/mellanox/__init__.py | 0 .../mellanox/mellanox_device_mocker.py | 131 +++++++ tests/system_health/test_system_health.py | 348 ++++++++++++++++++ 12 files changed, 616 insertions(+) create mode 100644 tests/system_health/__init__.py create mode 100644 tests/system_health/device_mocker.py create mode 100644 tests/system_health/files/device_check.json create mode 100644 tests/system_health/files/external_check.json create mode 100644 tests/system_health/files/ignore_asic_check.json create mode 100644 tests/system_health/files/ignore_device_check.json create mode 100644 tests/system_health/files/ignore_fan_check.json create mode 100644 tests/system_health/files/ignore_psu_check.json create mode 100644 tests/system_health/files/mock_valid_external_checker.txt create mode 100644 tests/system_health/mellanox/__init__.py create mode 100644 tests/system_health/mellanox/mellanox_device_mocker.py create mode 100644 tests/system_health/test_system_health.py diff --git a/tests/system_health/__init__.py b/tests/system_health/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/system_health/device_mocker.py b/tests/system_health/device_mocker.py new file mode 100644 index 00000000000..61243c5bbef --- /dev/null +++ b/tests/system_health/device_mocker.py @@ -0,0 +1,62 @@ +import os +import pytest +import sys + + +class DeviceMocker: + def deinit(self): + pass + + def mock_fan_presence(self, status): + return False, None + + def mock_fan_status(self, status): + return False, None + + def mock_fan_speed(self, good): + return False, None + + def mock_asic_temperature(self, good): + return False + + def mock_psu_presence(self, status): + return False, None + + def mock_psu_status(self, status): + return False, None + + def mock_psu_temperature(self, good): + return False, None + + def mock_psu_voltage(self, good): + return False, None + + +@pytest.fixture +def device_mocker_factory(): + """ + Fixture for system health data mocker factory. + :return: A function for creating system health related data mocker. + """ + mockers = [] + + def _create_mocker(dut): + """ + Create vendor specified mocker object by mocker name. + :param dut: DUT object representing a SONiC switch under test. + :return: Created mocker instance. + """ + platform = dut.facts['platform'] + mocker_object = None + if 'mlnx' in platform: + from .mellanox.mellanox_device_mocker import MellanoxDeviceMocker + mocker_object = MellanoxDeviceMocker(dut) + mockers.append(mocker_object) + else: + pytest.skip("No mocker defined for this platform %s") + return mocker_object + + yield _create_mocker + + for m in mockers: + m.deinit() diff --git a/tests/system_health/files/device_check.json b/tests/system_health/files/device_check.json new file mode 100644 index 00000000000..6912d5d533e --- /dev/null +++ b/tests/system_health/files/device_check.json @@ -0,0 +1,12 @@ +{ + "services_to_ignore": [], + "devices_to_ignore": [], + "external_checkers": [], + "polling_interval": 10, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} diff --git a/tests/system_health/files/external_check.json b/tests/system_health/files/external_check.json new file mode 100644 index 00000000000..f82f08c23ab --- /dev/null +++ b/tests/system_health/files/external_check.json @@ -0,0 +1,12 @@ +{ + "services_to_ignore": [], + "devices_to_ignore": [], + "external_checkers": ["cat /tmp/mock_valid_external_checker.txt"], + "polling_interval": 10, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} diff --git a/tests/system_health/files/ignore_asic_check.json b/tests/system_health/files/ignore_asic_check.json new file mode 100644 index 00000000000..35404587d06 --- /dev/null +++ b/tests/system_health/files/ignore_asic_check.json @@ -0,0 +1,12 @@ +{ + "services_to_ignore": [], + "devices_to_ignore": ["asic"], + "external_checkers": [], + "polling_interval": 10, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} diff --git a/tests/system_health/files/ignore_device_check.json b/tests/system_health/files/ignore_device_check.json new file mode 100644 index 00000000000..2f48cfac4a8 --- /dev/null +++ b/tests/system_health/files/ignore_device_check.json @@ -0,0 +1,12 @@ +{ + "services_to_ignore": [], + "devices_to_ignore": ["asic","fan","psu"], + "external_checkers": [], + "polling_interval": 10, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} diff --git a/tests/system_health/files/ignore_fan_check.json b/tests/system_health/files/ignore_fan_check.json new file mode 100644 index 00000000000..15dcdd2e428 --- /dev/null +++ b/tests/system_health/files/ignore_fan_check.json @@ -0,0 +1,12 @@ +{ + "services_to_ignore": [], + "devices_to_ignore": ["fan"], + "external_checkers": [], + "polling_interval": 10, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} diff --git a/tests/system_health/files/ignore_psu_check.json b/tests/system_health/files/ignore_psu_check.json new file mode 100644 index 00000000000..7ce38828a6a --- /dev/null +++ b/tests/system_health/files/ignore_psu_check.json @@ -0,0 +1,12 @@ +{ + "services_to_ignore": [], + "devices_to_ignore": ["psu"], + "external_checkers": [], + "polling_interval": 10, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + }, + "boot_timeout": 300 +} diff --git a/tests/system_health/files/mock_valid_external_checker.txt b/tests/system_health/files/mock_valid_external_checker.txt new file mode 100644 index 00000000000..ec76c681af9 --- /dev/null +++ b/tests/system_health/files/mock_valid_external_checker.txt @@ -0,0 +1,3 @@ +ExternalCategory +ExternalService:Service is not working +ExternalDevice:Device is broken \ No newline at end of file diff --git a/tests/system_health/mellanox/__init__.py b/tests/system_health/mellanox/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/system_health/mellanox/mellanox_device_mocker.py b/tests/system_health/mellanox/mellanox_device_mocker.py new file mode 100644 index 00000000000..756a16ca948 --- /dev/null +++ b/tests/system_health/mellanox/mellanox_device_mocker.py @@ -0,0 +1,131 @@ +from ..device_mocker import DeviceMocker +from common.mellanox_data import SWITCH_MODELS +from platform_tests.mellanox.mellanox_thermal_control_test_helper import MockerHelper, FanDrawerData, FanData, \ + FAN_NAMING_RULE + + +class AsicData(object): + TEMPERATURE_FILE = '/run/hw-management/thermal/asic' + THRESHOLD_FILE = '/run/hw-management/thermal/mlxsw/temp_trip_hot' + + def __init__(self, mock_helper): + self.helper = mock_helper + + def mock_asic_temperature(self, value): + self.helper.mock_value(AsicData.TEMPERATURE_FILE, str(value)) + + def get_asic_temperature_threshold(self): + value = self.helper.read_value(AsicData.THRESHOLD_FILE) + return int(value) + + +class PsuData(object): + PSU_STATUS_FILE = '/run/hw-management/thermal/psu{}_status' + PSU_POWER_STATUS_FILE = '/run/hw-management/thermal/psu{}_pwr_status' + PSU_TEMPERATURE_FILE = '/run/hw-management/thermal/psu{}_temp' + PSU_TEMP_THRESHOLD_FILE = '/run/hw-management/thermal/psu{}_temp_max' + + def __init__(self, mock_helper, index): + self.helper = mock_helper + self.index = index + self.name = 'PSU {}'.format(self.index) + power_status_file = PsuData.PSU_POWER_STATUS_FILE.format(index) + if self.helper._file_exist(power_status_file): + self.power_on = True + else: + self.power_on = False + + def mock_presence(self, status): + value = 1 if status else 0 + presence_file = PsuData.PSU_STATUS_FILE.format(self.index) + self.helper.mock_value(presence_file, str(value)) + + def mock_status(self, status): + value = 1 if status else 0 + power_status_file = PsuData.PSU_POWER_STATUS_FILE.format(self.index) + self.helper.mock_value(power_status_file, str(value)) + + def mock_temperature(self, value): + temperature_file = PsuData.PSU_TEMPERATURE_FILE.format(self.index) + self.helper.mock_value(temperature_file, str(value)) + + def get_psu_temperature_threshold(self): + threshold_file = PsuData.PSU_TEMP_THRESHOLD_FILE.format(self.index) + value = self.helper.read_value(threshold_file) + return int(value) + + +class MellanoxDeviceMocker(DeviceMocker): + TARGET_SPEED_VALUE = 60 + SPEED_TOLERANCE = 20 + PSU_NUM = 2 + + def __init__(self, dut): + self.mock_helper = MockerHelper(dut) + self.asic_data = AsicData(self.mock_helper) + naming_rule = FAN_NAMING_RULE['fan'] + self.fan_drawer_data = FanDrawerData(self.mock_helper, naming_rule, 1) + self.fan_data = FanData(self.mock_helper, naming_rule, 1) + + for i in range(MellanoxDeviceMocker.PSU_NUM): + self.psu_data = PsuData(self.mock_helper, i + 1) + if self.psu_data.power_on: + break + + def deinit(self): + self.mock_helper.deinit() + + def mock_fan_presence(self, status): + dut_hwsku = self.mock_helper.dut.facts["hwsku"] + always_present = not SWITCH_MODELS[dut_hwsku]['fans']['hot_swappable'] + if always_present: + return False, None + + value = 1 if status else 0 + self.fan_drawer_data.mock_presence(value) + return True, self.fan_data.name + + def mock_fan_status(self, status): + value = 0 if status else 1 + self.fan_data.mock_status(value) + return True, self.fan_data.name + + def mock_fan_speed(self, good): + target_speed = self.fan_data.get_target_speed() + if good: + actual_speed = target_speed + else: + actual_speed = target_speed * (5 + ((MellanoxDeviceMocker.SPEED_TOLERANCE + 1) / float(100))) + actual_speed = int(actual_speed) + self.fan_data.mock_speed(actual_speed) + return True, self.fan_data.name + + def mock_asic_temperature(self, good): + threshold = self.asic_data.get_asic_temperature_threshold() + if good: + value = threshold - 1000 + else: + value = threshold + 1000 + self.asic_data.mock_asic_temperature(value) + return True + + def mock_psu_presence(self, status): + self.psu_data.mock_presence(1 if status else 0) + return True, self.psu_data.name + + def mock_psu_status(self, status): + self.psu_data.mock_status(1 if status else 0) + return True, self.psu_data.name + + def mock_psu_temperature(self, good): + threshold = self.psu_data.get_psu_temperature_threshold() + if good: + value = threshold - 1000 + else: + value = threshold + 1000 + self.psu_data.mock_temperature(value) + return True, self.psu_data.name + + def mock_psu_voltage(self, good): + # Not Supported for now + return False, None diff --git a/tests/system_health/test_system_health.py b/tests/system_health/test_system_health.py new file mode 100644 index 00000000000..f1116f77269 --- /dev/null +++ b/tests/system_health/test_system_health.py @@ -0,0 +1,348 @@ +import json +import logging +import os +import pytest +import time +from common.utilities import wait_until +from device_mocker import device_mocker_factory + +HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO' + +BASE_DIR = os.path.dirname(os.path.realpath(__file__)) +FILES_DIR = os.path.join(BASE_DIR, 'files') +DUT_CONFIG_FILE = '/usr/share/sonic/device/{}/system_health_monitoring_config.json' +DUT_CONFIG_BACKUP_FILE = '/usr/share/sonic/device/{}/system_health_monitoring_config.json.bak' +DEVICE_CHECK_CONFIG_FILE = 'device_check.json' +EXTERNAL_CHECK_CONFIG_FILE = 'external_check.json' +IGNORE_ASIC_CHECK_CONFIG_FILE = 'ignore_asic_check.json' +IGNORE_FAN_CHECK_CONFIG_FILE = 'ignore_fan_check.json' +IGNORE_PSU_CHECK_CONFIG_FILE = 'ignore_psu_check.json' +IGNORE_DEVICE_CHECK_CONFIG_FILE = 'ignore_device_check.json' +EXTERNAL_CHECKER_MOCK_FILE = 'mock_valid_external_checker.txt' + +DEFAULT_BOOT_TIMEOUT = 300 +DEFAULT_INTERVAL = 60 +FAST_INTERVAL = 10 +THERMAL_CHECK_INTERVAL = 70 +PSU_CHECK_INTERVAL = FAST_INTERVAL + 5 +STATE_DB = 6 + +SERVICE_EXPECT_STATUS_DICT = { + 'System': 'Running', + 'Process': 'Running', + 'Filesystem': 'Accessible' +} +SUMMARY_OK = 'OK' +SUMMARY_NOT_OK = 'Not OK' + +EXPECT_FAN_MISSING = '{} is missing' +EXPECT_FAN_BROKEN = '{} is broken' +EXPECT_FAN_INVALID_SPEED = '{} speed is out of range' +EXPECT_ASIC_HOT = 'ASIC temperature is too hot' +EXPECT_PSU_MISSING = '{} is missing or not available' +EXPECT_PSU_NO_POWER = '{} is out of power' +EXPECT_PSU_HOT = '{} temperature is too hot' +EXPECT_PSU_INVALID_VOLTAGE = '{} voltage is out of range' + + +def test_service_checker(duthost): + wait_system_health_boot_up(duthost) + with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)): + cmd = "monit summary -B" + logging.info('Getting output for command {}'.format(cmd)) + output = duthost.shell(cmd) + content = output['stdout'].strip() + lines = content.splitlines() + status_begin = lines[1].find('Status') + type_begin = lines[1].find('Type') + expect_error_dict = {} + logging.info('Getting service status') + for line in lines[2:]: + service_name = line[0:status_begin].strip() + status = line[status_begin:type_begin].strip() + service_type = line[type_begin:].strip() + assert service_type in SERVICE_EXPECT_STATUS_DICT, 'Unknown service type {}'.format(service_type) + expect_status = SERVICE_EXPECT_STATUS_DICT[service_type] + if expect_status != status: + expect_error_dict[service_name] = '{} is not {}'.format(service_name, expect_status) + + logging.info('Waiting {} seconds for healthd to work'.format(DEFAULT_INTERVAL)) + time.sleep(DEFAULT_INTERVAL) + if expect_error_dict: + logging.info('Verify data in redis') + for name, error in expect_error_dict.items(): + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, name) + assert value == error, 'Expect error {}, got {}'.format(error, value) + + summary = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'summary') + expect_summary = SUMMARY_OK if not expect_error_dict else SUMMARY_NOT_OK + assert summary == expect_summary, 'Expect summary {}, got {}'.format(expect_summary, summary) + + +def test_device_checker(duthost, device_mocker_factory): + device_mocker = device_mocker_factory(duthost) + wait_system_health_boot_up(duthost) + with ConfigFileContext(duthost, os.path.join(FILES_DIR, DEVICE_CHECK_CONFIG_FILE)): + time.sleep(DEFAULT_INTERVAL) + mock_result, fan_name = device_mocker.mock_fan_speed(False) + expect_value = EXPECT_FAN_INVALID_SPEED.format(fan_name) + if mock_result: + logging.info('Mocked invalid fan speed for {}, waiting {} seconds for it to take effect'.format(fan_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert value and expect_value in value, 'Mock fan invalid speed, expect {}, but got {}'.format(expect_value, + value) + mock_result, fan_name = device_mocker.mock_fan_speed(True) + if mock_result: + logging.info('Mocked valid fan speed for {}, waiting {} seconds for it to take effect'.format(fan_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert not value or expect_value not in value, 'Mock fan valid speed, expect {}, ' \ + 'but it still report invalid speed' + + mock_result, fan_name = device_mocker.mock_fan_presence(False) + expect_value = EXPECT_FAN_MISSING.format(fan_name) + if mock_result: + logging.info('Mocked fan absence {}, waiting {} seconds for it to take effect'.format(fan_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert value and value == expect_value, 'Mock fan absence, expect {}, but got {}'.format(expect_value, + value) + + mock_result, fan_name = device_mocker.mock_fan_presence(True) + if mock_result: + logging.info('Mocked fan presence for {}, waiting {} seconds for it to take effect'.format(fan_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert not value or value != expect_value, 'Mock fan presence, but it still report absence' + + mock_result, fan_name = device_mocker.mock_fan_status(False) + expect_value = EXPECT_FAN_BROKEN.format(fan_name) + if mock_result: + logging.info('Mocked fan broken for {}, waiting {} seconds for it to take effect'.format(fan_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert value and value == expect_value, 'Mock fan broken, expect {}, but got {}'.format(expect_value, + value) + mock_result, fan_name = device_mocker.mock_fan_status(True) + if mock_result: + logging.info('Mocked fan good for {}, waiting {} seconds for it to take effect'.format(fan_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert not value or value != expect_value, 'Mock fan normal, but it still report broken' + + mock_result = device_mocker.mock_asic_temperature(False) + expect_value = EXPECT_ASIC_HOT + if mock_result: + logging.info('Mocked ASIC hot, waiting {} seconds for it to take effect'.format(THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC') + assert value and expect_value in value, 'Mock ASIC temperature hot, expect {}, but got {}'.format( + expect_value, + value) + + mock_result = device_mocker.mock_asic_temperature(True) + if mock_result: + logging.info('Mocked ASIC cold, waiting {} seconds for it to take effect'.format(THERMAL_CHECK_INTERVAL)) + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC') + assert not value or expect_value not in value, 'Mock ASIC temperature normal, but it is still hot' + + mock_result, psu_name = device_mocker.mock_psu_presence(False) + expect_value = EXPECT_PSU_MISSING.format(psu_name) + if mock_result: + logging.info('Mocked PSU absence for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert value and expect_value == value, 'Mock PSU absence, expect {}, but got {}'.format(expect_value, + value) + + mock_result, psu_name = device_mocker.mock_psu_presence(True) + if mock_result: + logging.info('Mocked PSU presence for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert not value or expect_value != value, 'Mock PSU presence, but it is still absence' + + mock_result, psu_name = device_mocker.mock_psu_status(False) + expect_value = EXPECT_PSU_NO_POWER.format(psu_name) + if mock_result: + logging.info('Mocked PSU no power for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert value and expect_value == value, 'Mock PSU no power, expect {}, but got {}'.format(expect_value, + value) + + mock_result, psu_name = device_mocker.mock_psu_status(True) + if mock_result: + logging.info('Mocked PSU good power for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert not value or expect_value != value, 'Mock PSU power good, but it is still out of power' + + mock_result, psu_name = device_mocker.mock_psu_temperature(False) + expect_value = EXPECT_PSU_HOT.format(psu_name) + if mock_result: + logging.info('Mocked PSU hot for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert value and expect_value in value, 'Mock PSU hot, expect {}, but got {}'.format(expect_value, + value) + + mock_result, psu_name = device_mocker.mock_psu_temperature(True) + if mock_result: + logging.info('Mocked PSU cold for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert not value or expect_value not in value, 'Mock PSU cold, but it is still hot' + + mock_result, psu_name = device_mocker.mock_psu_voltage(False) + expect_value = EXPECT_PSU_INVALID_VOLTAGE.format(psu_name) + if mock_result: + logging.info('Mocked PSU bad voltage for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert value and expect_value in value, 'Mock PSU invalid voltage, expect {}, but got {}'.format( + expect_value, + value) + + mock_result, psu_name = device_mocker.mock_psu_voltage(True) + if mock_result: + logging.info('Mocked PSU good voltage for {}, waiting {} seconds for it to take effect'.format(psu_name, + THERMAL_CHECK_INTERVAL)) + time.sleep(FAST_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert not value or expect_value not in value, 'Mock PSU good voltage, but it is still invalid' + + +def test_external_checker(duthost): + wait_system_health_boot_up(duthost) + with ConfigFileContext(duthost, os.path.join(FILES_DIR, EXTERNAL_CHECK_CONFIG_FILE)): + duthost.copy(src=os.path.join(FILES_DIR, EXTERNAL_CHECKER_MOCK_FILE), + dest=os.path.join('/tmp', EXTERNAL_CHECKER_MOCK_FILE)) + time.sleep(DEFAULT_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ExternalService') + assert value == 'Service is not working', 'External checker does not work, value={}'.format(value) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ExternalDevice') + assert value == 'Device is broken', 'External checker does not work, value={}'.format(value) + + +def test_system_health_config(duthost, device_mocker_factory): + device_mocker = device_mocker_factory(duthost) + wait_system_health_boot_up(duthost) + logging.info('Ignore fan check, verify there is no error information about fan') + with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_FAN_CHECK_CONFIG_FILE)): + time.sleep(DEFAULT_INTERVAL) + mock_result, fan_name = device_mocker.mock_fan_presence(False) + expect_value = EXPECT_FAN_MISSING.format(fan_name) + if mock_result: + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name) + assert not value or expect_value != value, 'Fan check is still performed after it ' \ + 'is configured to be ignored' + + logging.info('Ignore ASIC check, verify there is no error information about ASIC') + with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_ASIC_CHECK_CONFIG_FILE)): + time.sleep(FAST_INTERVAL) + mock_result = device_mocker.mock_asic_temperature(False) + expect_value = EXPECT_ASIC_HOT + if mock_result: + time.sleep(THERMAL_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC') + assert not value or expect_value not in value, 'ASIC check is still performed after it ' \ + 'is configured to be ignored' + + logging.info('Ignore PSU check, verify there is no error information about psu') + with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_PSU_CHECK_CONFIG_FILE)): + time.sleep(FAST_INTERVAL) + mock_result, psu_name = device_mocker.mock_psu_presence(False) + expect_value = EXPECT_PSU_MISSING.format(psu_name) + if mock_result: + time.sleep(PSU_CHECK_INTERVAL) + value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name) + assert not value or expect_value != value, 'PSU check is still performed after it ' \ + 'is configured to be ignored' + + +def wait_system_health_boot_up(duthost): + boot_timeout = get_system_health_config(duthost, 'boot_timeout', DEFAULT_BOOT_TIMEOUT) + assert wait_until(boot_timeout, 10, redis_table_exists, duthost, STATE_DB, HEALTH_TABLE_NAME), \ + 'System health service is not working' + + +def get_system_health_config(duthost, key, default): + try: + platform_str = duthost.facts['platform'] + config_file = DUT_CONFIG_FILE.format(platform_str) + cmd = 'cat {}'.format(config_file) + output = duthost.shell(cmd) + content = output['stdout'].strip() + json_obj = json.loads(content) + return json_obj[key] + except: + return default + + +def redis_table_exists(duthost, db_id, key): + cmd = 'redis-cli --raw -n {} EXISTS \"{}\"'.format(db_id, key) + logging.info('Checking if table exists in redis with cmd: {}'.format(cmd)) + output = duthost.shell(cmd) + content = output['stdout'].strip() + return content != '0' + + +def redis_get_field_value(duthost, db_id, key, field_name): + cmd = 'redis-cli --raw -n {} HGET \"{}\" \"{}\"'.format(db_id, key, field_name) + logging.info('Getting field value from redis with cmd: {}'.format(cmd)) + output = duthost.shell(cmd) + content = output['stdout'].strip() + return content + + +class ConfigFileContext: + """ + Context class to help replace system health policy file and restore it automatically. + """ + + def __init__(self, dut, src): + """ + Constructor of ConfigFileContext. + :param dut: DUT object representing a SONiC switch under test. + :param src: Local config file path. + """ + self.dut = dut + self.src = src + platform_str = dut.facts['platform'] + self.origin_config = DUT_CONFIG_FILE.format(platform_str) + self.backup_config = DUT_CONFIG_BACKUP_FILE.format(platform_str) + + def __enter__(self): + """ + Back up original system health config file and replace it with the given one. + :return: + """ + self.dut.command('mv -f {} {}'.format(self.origin_config, self.backup_config)) + self.dut.copy(src=os.path.join(FILES_DIR, self.src), dest=self.origin_config) + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Restore original system health config file. + :param exc_type: Not used. + :param exc_val: Not used. + :param exc_tb: Not used. + :return: + """ + self.dut.command('mv -f {} {}'.format(self.backup_config, self.origin_config)) From f6da76617de0c09d7eda4a3641c5a2b788d2cb69 Mon Sep 17 00:00:00 2001 From: Shlomi Bitton Date: Sun, 12 Jul 2020 19:17:05 +0300 Subject: [PATCH 2/2] Fix imports and change 'platform' to 'asic_type' for identifying Signed-off-by: Shlomi Bitton --- tests/system_health/device_mocker.py | 4 ++-- tests/system_health/mellanox/mellanox_device_mocker.py | 4 ++-- tests/system_health/test_system_health.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/system_health/device_mocker.py b/tests/system_health/device_mocker.py index 61243c5bbef..26719c18ee3 100644 --- a/tests/system_health/device_mocker.py +++ b/tests/system_health/device_mocker.py @@ -46,9 +46,9 @@ def _create_mocker(dut): :param dut: DUT object representing a SONiC switch under test. :return: Created mocker instance. """ - platform = dut.facts['platform'] + asic = dut.facts['asic_type'] mocker_object = None - if 'mlnx' in platform: + if 'mellanox' in asic: from .mellanox.mellanox_device_mocker import MellanoxDeviceMocker mocker_object = MellanoxDeviceMocker(dut) mockers.append(mocker_object) diff --git a/tests/system_health/mellanox/mellanox_device_mocker.py b/tests/system_health/mellanox/mellanox_device_mocker.py index 756a16ca948..12aac4ca1ea 100644 --- a/tests/system_health/mellanox/mellanox_device_mocker.py +++ b/tests/system_health/mellanox/mellanox_device_mocker.py @@ -1,6 +1,6 @@ from ..device_mocker import DeviceMocker -from common.mellanox_data import SWITCH_MODELS -from platform_tests.mellanox.mellanox_thermal_control_test_helper import MockerHelper, FanDrawerData, FanData, \ +from tests.common.mellanox_data import SWITCH_MODELS +from tests.platform_tests.mellanox.mellanox_thermal_control_test_helper import MockerHelper, FanDrawerData, FanData, \ FAN_NAMING_RULE diff --git a/tests/system_health/test_system_health.py b/tests/system_health/test_system_health.py index f1116f77269..5e3ad023bc8 100644 --- a/tests/system_health/test_system_health.py +++ b/tests/system_health/test_system_health.py @@ -3,7 +3,7 @@ import os import pytest import time -from common.utilities import wait_until +from tests.common.utilities import wait_until from device_mocker import device_mocker_factory HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'