Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 55 additions & 19 deletions tests/system_health/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import logging
import os
import pytest
import random
import time
from pkg_resources import parse_version
from tests.common import config_reload
from tests.common.utilities import wait_until
from tests.common.helpers.assertions import pytest_require
from tests.platform_tests.thermal_control_test_helper import disable_thermal_policy
Expand Down Expand Up @@ -63,27 +65,22 @@ def check_image_version(duthost):
yield


@pytest.fixture(autouse=True, scope='module')
def config_reload_after_tests(duthost):
yield
config_reload(duthost)


def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
wait_system_health_boot_up(duthost)
with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
cmd = "monit summary -B"
logger.info('Getting output for command {}'.format(cmd))
output = duthost.shell(cmd)
content = output['stdout'].strip()
lines = content.splitlines()
status_begin = lines[1].find('Status')
type_begin = lines[1].find('Type')
processes_status = duthost.all_critical_process_status()
expect_error_dict = {}
logger.info('Getting service status')
for line in lines[2:]:
service_name = line[0:status_begin].strip()
status = line[status_begin:type_begin].strip()
service_type = line[type_begin:].strip()
assert service_type in SERVICE_EXPECT_STATUS_DICT, 'Unknown service type {}'.format(service_type)
expect_status = SERVICE_EXPECT_STATUS_DICT[service_type]
if expect_status != status:
expect_error_dict[service_name] = '{} is not {}'.format(service_name, expect_status)
for container_name, processes in processes_status.items():
if processes["status"] is False or len(processes["exited_critical_process"]) > 0:
for process_name in processes["exited_critical_process"]:
expect_error_dict[process_name] = '{}:{} is not running'.format(container_name, process_name)

logger.info('Waiting {} seconds for healthd to work'.format(DEFAULT_INTERVAL))
time.sleep(DEFAULT_INTERVAL)
Expand All @@ -98,6 +95,30 @@ def test_service_checker(duthosts, enum_rand_one_per_hwsku_hostname):
assert summary == expect_summary, 'Expect summary {}, got {}'.format(expect_summary, summary)


@pytest.mark.disable_loganalyzer
def test_service_checker_with_process_exit(duthosts, enum_rand_one_per_hwsku_hostname):
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
wait_system_health_boot_up(duthost)
with ConfigFileContext(duthost, os.path.join(FILES_DIR, IGNORE_DEVICE_CHECK_CONFIG_FILE)):
processes_status = duthost.all_critical_process_status()
containers = [x for x in list(processes_status.keys()) if x != "syncd" and x !="database"]
logging.info('Test containers: {}'.format(containers))
random.shuffle(containers)
for container in containers:
running_critical_process = processes_status[container]['running_critical_process']
if not running_critical_process:
continue

critical_process = random.sample(running_critical_process, 1)[0]
with ProcessExitContext(duthost, container, critical_process):
time.sleep(DEFAULT_INTERVAL)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DEFAULT_INTERVAL

What is the expected max time for redis table to take effect? Is 60s too long?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

System-health poll critical process status every 60 seconds, so it has to wait 60 second here.

value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, '{}:{}'.format(container, critical_process))
assert value == "'{}' is not running".format(critical_process), 'Got value {}'.format(value)
summary = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'summary')
assert summary == SUMMARY_NOT_OK
break


@pytest.mark.disable_loganalyzer
def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocker_factory, disable_thermal_policy):
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
Expand Down Expand Up @@ -139,7 +160,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
time.sleep(THERMAL_CHECK_INTERVAL)
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
assert not value or fan_expect_value not in value, 'Mock fan valid speed, expect {}, but it still report invalid speed'

value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, 'ASIC')
assert not value or asic_expect_value not in value, 'Mock ASIC normal temperature, but it is still overheated'

Expand All @@ -158,7 +179,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
assert value and value == fan_expect_value, 'Mock fan absence, expect {}, but got {}'.format(fan_expect_value,
value)

value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name)
assert value and psu_expect_value == value, 'Mock PSU no power, expect {}, but got {}'.format(psu_expect_value,
value)
Expand All @@ -173,7 +194,7 @@ def test_device_checker(duthosts, enum_rand_one_per_hwsku_hostname, device_mocke
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, fan_name)
assert not value or value != fan_expect_value, 'Mock fan presence, but it still report absence'


time.sleep(PSU_CHECK_INTERVAL)
value = redis_get_field_value(duthost, STATE_DB, HEALTH_TABLE_NAME, psu_name)
assert not value or psu_expect_value != value, 'Mock PSU power good, but it is still out of power'
Expand Down Expand Up @@ -348,3 +369,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
:return:
"""
self.dut.command('mv -f {} {}'.format(self.backup_config, self.origin_config))


class ProcessExitContext:
Copy link
Copy Markdown
Contributor

@qiluo-msft qiluo-msft Oct 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ProcessExitContext

Super concise and useful, thanks! #Closed

def __init__(self, dut, container_name, process_name):
self.dut = dut
self.container_name = container_name
self.process_name = process_name

def __enter__(self):
logging.info('Stopping {}:{}'.format(self.container_name, self.process_name))
self.dut.command('docker exec -it {} bash -c "supervisorctl stop {}"'.format(self.container_name, self.process_name))

def __exit__(self, exc_type, exc_val, exc_tb):
logging.info('Starting {}:{}'.format(self.container_name, self.process_name))
self.dut.command('docker exec -it {} bash -c "supervisorctl start {}"'.format(self.container_name, self.process_name))