From 7fda7f972cd9be41dde4cd9ca2ad956d1f39b48e Mon Sep 17 00:00:00 2001
From: junchao
Date: Wed, 20 Oct 2021 16:10:32 +0800
Subject: [PATCH 1/2] Adjust test cases of system health

Change-Id: I01710a47112b483fbedc5d0ff83b70bed5c44c70
---
 tests/system_health/test_system_health.py | 64 +++++++++++++++++------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/tests/system_health/test_system_health.py b/tests/system_health/test_system_health.py
index 9ad6fba2720..37b18f6a244 100644
--- a/tests/system_health/test_system_health.py
+++ b/tests/system_health/test_system_health.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import pytest
+import random
 import time
 from pkg_resources import parse_version
 from tests.common.utilities import wait_until
@@ -67,23 +68,12 @@ def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
     duthost = duthosts[enum_rand_one_per_hwsku_hostname]
     wait_system_health_boot_up(duthost)
     with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
-        cmd = "monit summary -B"
-        logger.info('Getting output for command {}'.format(cmd))
-        output = duthost.shell(cmd)
-        content = output['stdout'].strip()
-        lines = content.splitlines()
-        status_begin = lines[1].find('Status')
-        type_begin = lines[1].find('Type')
+        processes_status = duthost.all_critical_process_status()
         expect_error_dict = {}
-        logger.info('Getting service status')
-        for line in lines[2:]:
-            service_name = line[0:status_begin].strip()
-            status = line[status_begin:type_begin].strip()
-            service_type = line[type_begin:].strip()
-            assert service_type in SERVICE_EXPECT_STATUS_DICT, 'Unknown service type {}'.format(service_type)
-            expect_status = SERVICE_EXPECT_STATUS_DICT[service_type]
-            if expect_status != status:
-                expect_error_dict[service_name] = '{} is not {}'.format(service_name, expect_status)
+        for container_name, processes in processes_status.items():
+            if processes["status"] is False or len(processes["exited_critical_process"]) > 0:
+                for process_name in processes["exited_critical_process"]:
+                    expect_error_dict[process_name] = '{}:{} is not running'.format(container_name, process_name)

         logger.info('Waiting {} seconds for healthd to work'.format(DEFAULT_INTERVAL))
         time.sleep(DEFAULT_INTERVAL)
@@ -98,6 +88,33 @@ def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
         assert summary == expect_summary, 'Expect summary {}, got {}'.format(expect_summary, summary)


+@pytest.mark.disable_loganalyzer
+def test_service_checker_with_process_exit(duthosts, enum_rand_one_per_hwsku_hostname):
+    duthost = duthosts[enum_rand_one_per_hwsku_hostname]
+    wait_system_health_boot_up(duthost)
+    with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
+        processes_status = duthost.all_critical_process_status()
+        containers = list(processes_status.keys())
+        logging.info('Runnig containers: {}'.format(containers))
+        random.shuffle(containers)
+        for container in containers:
+            if container == 'syncd' or container == 'database':
+                continue
+
+            running_critical_process = processes_status[container]['running_critical_process']
+            if not running_critical_process:
+                continue
+
+            critical_process = random.sample(running_critical_process, 1)[0]
+            with ProcessExitContext(duthost, container, critical_process):
+                time.sleep(DEFAULT_INTERVAL)
+                value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, '{}:{}'.format(container, critical_process))
+                assert value == "'{}' is not running".format(critical_process), 'Got value {}'.format(value)
+                summary = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'summary')
+                assert summary == SUMMARY_NOT_OK
+            break
+
+
 @pytest.mark.disable_loganalyzer
 def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocker_factory, disable_thermal_policy):
     duthost = duthosts[enum_rand_one_per_hwsku_hostname]
@@ -348,3 +365,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         :return:
         """
         self.dut.command('mv -f {} {}'.format(self.backup_config, self.origin_config))
+
+
+class ProcessExitContext:
+    def __init__(self, dut, container_name, process_name):
+        self.dut = dut
+        self.container_name = container_name
+        self.process_name = process_name
+
+    def __enter__(self):
+        logging.info('Stopping {}:{}'.format(self.container_name, self.process_name))
+        self.dut.command('docker exec -it {} bash -c "supervisorctl stop {}"'.format(self.container_name, self.process_name))
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        logging.info('Starting {}:{}'.format(self.container_name, self.process_name))
+        self.dut.command('docker exec -it {} bash -c "supervisorctl start {}"'.format(self.container_name, self.process_name))
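
Note on the patch above: both reworked service-checker tests key off duthost.all_critical_process_status(). That helper belongs to sonic-mgmt's DUT host wrapper and its return value is not shown in this diff; the sketch below infers the per-container shape purely from how the tests subscript it, and the container and process names in it are invented for illustration.

    # Assumed shape only -- inferred from the subscripting in the tests above;
    # 'swss'/'pmon' and the process names are illustrative, not from the patch.
    processes_status = {
        'swss': {
            'status': True,                                # False when any critical process exited
            'running_critical_process': ['orchagent', 'portsyncd'],
            'exited_critical_process': [],
        },
        'pmon': {
            'status': False,
            'running_critical_process': ['xcvrd'],
            'exited_critical_process': ['psud'],
        },
    }

    # test_service_checker derives the errors it expects healthd to publish:
    expect_error_dict = {}
    for container_name, processes in processes_status.items():
        if processes['status'] is False or len(processes['exited_critical_process']) > 0:
            for process_name in processes['exited_critical_process']:
                expect_error_dict[process_name] = '{}:{} is not running'.format(container_name, process_name)

    assert expect_error_dict == {'psud': 'pmon:psud is not running'}

With that shape, a container is flagged as soon as its status is False or it has any exited critical process, and each exited process maps to the same '{container}:{process} is not running' string the test later reads back from STATE_DB.
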
From 51e4f9949fd9a9dc9620d6a036eee8c5957a15ff Mon Sep 17 00:00:00 2001
From: junchao
Date: Thu, 21 Oct 2021 17:52:31 +0800
Subject: [PATCH 2/2] Fix review comment

Change-Id: Idd1ca2133593c45ce18de0e82a40281ab557561d
---
 tests/system_health/test_system_health.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/tests/system_health/test_system_health.py b/tests/system_health/test_system_health.py
index 37b18f6a244..0bb60706381 100644
--- a/tests/system_health/test_system_health.py
+++ b/tests/system_health/test_system_health.py
@@ -5,6 +5,7 @@
 import random
 import time
 from pkg_resources import parse_version
+from tests.common import config_reload
 from tests.common.utilities import wait_until
 from tests.common.helpers.assertions import pytest_require
 from tests.platform_tests.thermal_control_test_helper import disable_thermal_policy
@@ -64,6 +65,12 @@ def check_image_version(duthost):
     yield


+@pytest.fixture(autouse=True, scope='module')
+def config_reload_after_tests(duthost):
+    yield
+    config_reload(duthost)
+
+
 def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
     duthost = duthosts[enum_rand_one_per_hwsku_hostname]
     wait_system_health_boot_up(duthost)
@@ -94,13 +101,10 @@ def test_service_checker_with_process_exit(duthosts, enum_rand_one_per_hwsku_hos
     duthost = duthosts[enum_rand_one_per_hwsku_hostname]
     wait_system_health_boot_up(duthost)
     with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
         processes_status = duthost.all_critical_process_status()
-        containers = list(processes_status.keys())
-        logging.info('Runnig containers: {}'.format(containers))
+        containers = [x for x in list(processes_status.keys()) if x != "syncd" and x != "database"]
+        logging.info('Test containers: {}'.format(containers))
         random.shuffle(containers)
         for container in containers:
-            if container == 'syncd' or container == 'database':
-                continue
-
             running_critical_process = processes_status[container]['running_critical_process']
             if not running_critical_process:
                 continue
@@ -156,7 +160,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
         time.sleep(THERMAL_CHECK_INTERVAL)
         value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
         assert not value or fan_expect_value not in value, 'Mock fan valid speed, expect {}, but it still report invalid speed'
-        
+
         value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC')
         assert not value or asic_expect_value not in value, 'Mock ASIC normal temperature, but it is still overheated'
@@ -175,7 +179,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
         value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
         assert value and value == fan_expect_value, 'Mock fan absence, expect {}, but got {}'.format(fan_expect_value, value)
-        
+
         value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name)
         assert value and psu_expect_value == value, 'Mock PSU no power, expect {}, but got {}'.format(psu_expect_value, value)
@@ -190,7 +194,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
         value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
         assert not value or value != fan_expect_value, 'Mock fan presence, but it still report absence'
-        
+
         time.sleep(PSU_CHECK_INTERVAL)
         value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name)
         assert not value or psu_expect_value != value, 'Mock PSU power good, but it is still out of power'
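
A closing note on ProcessExitContext from patch 1: the supervisorctl start call lives in __exit__, so the stopped process is restarted even when an assertion inside the with block fails, which keeps a failed run from leaving the DUT degraded. A self-contained sketch of that guarantee, where FakeDut is a hypothetical stand-in for the real host object (whose command() runs a shell command on the DUT):

    # Illustration only -- FakeDut is a made-up stand-in for the ansible host wrapper.
    class FakeDut:
        def command(self, cmd):
            print('DUT would run: {}'.format(cmd))

    class ProcessExitContext:
        def __init__(self, dut, container_name, process_name):
            self.dut = dut
            self.container_name = container_name
            self.process_name = process_name

        def __enter__(self):
            self.dut.command('docker exec -it {} bash -c "supervisorctl stop {}"'.format(
                self.container_name, self.process_name))

        def __exit__(self, exc_type, exc_val, exc_tb):
            # __exit__ runs on both normal and exceptional exit, so the
            # process comes back even if a health-check assertion failed.
            self.dut.command('docker exec -it {} bash -c "supervisorctl start {}"'.format(
                self.container_name, self.process_name))

    try:
        with ProcessExitContext(FakeDut(), 'swss', 'orchagent'):
            raise AssertionError('simulated failed health check')
    except AssertionError:
        pass  # the 'supervisorctl start' command was still issued by __exit__
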