diff --git a/tests/common/plugins/sanity_check/README.md b/tests/common/plugins/sanity_check/README.md index 06e9c85688e..dd5b23326e2 100644 --- a/tests/common/plugins/sanity_check/README.md +++ b/tests/common/plugins/sanity_check/README.md @@ -52,13 +52,13 @@ We can use keyword argument `check_items` to fine tune the items to be checked i * interfaces: Check the status of network interfaces. Please refer to sonic-mgmt/tests/common/plugins/sanity_check/constants::SUPPORTED_CHECK_ITEMS for the latest supported check items. -Value for `check_items` should be a tuple or list of strings. Each item in the tuple or list should be a string. The string can be name of the supported check items with optional prefix `+` or `-`. Unsupported check items will be ignored. +Value for `check_items` should be a tuple or list of strings. Each item in the tuple or list should be a string. The string can be name of the supported check items with optional prefix `+` or `-` or `_`. Unsupported check items will be ignored. -If a supported check item is prefixed with `-`, then this item will not be checked in sanity. For items with prefix `+` or without prefixes, the item should be included in the list of items to be checked in sanity. +If a supported check item is prefixed with `-` or `_`, then this item will not be checked in sanity. For items with prefix `+` or without prefixes, the item should be included in the list of items to be checked in sanity. With this design, we can extend the sanity check items in the future. By default, only a very basic set of sanity check is performed. For some test scripts that do not need some default sanity check items or need some extra sanity check items, we can use this syntax to tailor the check items that fit best for the current test script. -User can change check item list by passing parameter from command line --check_items="add remove string". Exmaple: --check_items="-services,+bgp" means do not check services, but add bgp to the check list. 
This parameter is not an absolute list, it is addition or subtraction from the existing list. +User can change check item list by passing parameter from command line --check_items="add remove string". Example: --check_items="_services,+bgp" means do not check services, but add bgp to the check list. This parameter is not an absolute list, it is addition or subtraction from the existing list. On the command line "-" has special meaning. So, we need to prefix "_" to skip a check item. ## Log collecting If sanity check is to be performed, the script will also run some commands on the DUT to collect some basic information for debugging. Please refer to sonic-mgmt/tests/common/plugins/sanity_check/constants::PRINT_LOGS for the list of logs that will be collected. @@ -77,6 +77,15 @@ The sanity check plugin also supports pytest command line option `--allow_recove $ pytest -i inventory --host-pattern switch1-t0 --module-path ../ansible/library/ --testbed switch1-t0 --testbed-file testbed.csv --log-cli-level info test_something.py --allow_recover ``` +## Check item +The check items are defined in the `checks.py` module. In the original design, check item is defined as an ordinary function. All the dependent fixtures must be specified in the argument list of `sanity_check`. Then objects of the fixtures are passed to the check functions as arguments. However, this design has a limitation. Not all the sanity check dependent fixtures are supported on all topologies. On some topologies, sanity check may fail when getting those fixtures. +To resolve that issue, we have changed the design. Now the check items must be defined as fixtures. Then the check fixtures can be dynamically attached to test cases during run time. In the sanity check plugin, we can check the current testbed type or other conditions to decide whether or not to load certain check fixtures. + +### Check item implementation details +Each check fixture must use the factory design pattern to return a check function. 
Then we can delay execution of the various sanity checks after the sanity check items have been dynamically adjusted. + +Check fixture must be named with pattern `check_`. When a new check fixture is defined, its name must be added to the `__all__` list of the `checks.py` module. + ## Why check networking uptime? The sanity check may be performed right after the DUT is rebooted or config reload is performed. In this case, services and interfaces may not be ready yet and sanity check will fail unnecessarily. diff --git a/tests/common/plugins/sanity_check/__init__.py b/tests/common/plugins/sanity_check/__init__.py index b0ba37fc75b..b0fc853afa4 100644 --- a/tests/common/plugins/sanity_check/__init__.py +++ b/tests/common/plugins/sanity_check/__init__.py @@ -1,18 +1,30 @@ import logging -import random import copy import json import pytest -import constants -from checks import do_checks, print_logs -from recover import recover +from inspect import getmembers, isfunction +from collections import defaultdict + +from tests.common.plugins.sanity_check import constants +from tests.common.plugins.sanity_check import checks +from tests.common.plugins.sanity_check.checks import * +from tests.common.plugins.sanity_check.recover import recover from tests.common.helpers.assertions import pytest_assert as pt_assert +from tests.common.plugins.sanity_check.checks import check_monit + logger = logging.getLogger(__name__) +SUPPORTED_CHECKS = [member[0].replace('check_', '') for member in getmembers(checks, isfunction) + if member[0].startswith('check_')] + + +def _item2fixture(item): + return 'check_' + item + def _update_check_items(old_items, new_items, supported_items): """ @@ -25,7 +37,7 @@ def _update_check_items(old_items, new_items, supported_items): for new_item in new_items: if not new_item: continue - if new_item[0] == "-": # Remove default check item + if new_item[0] in ["_", "-"]: # Remove default check item new_item = new_item[1:] if new_item in updated_items: 
logger.info("Skip checking '%s'" % new_item) @@ -33,22 +45,44 @@ else: # Add a check item if new_item[0] == "+": new_item = new_item[1:] - if new_item in supported_items: - logger.info("Add checking '%s'" % new_item) - updated_items.add(new_item) + if new_item in supported_items : + if new_item not in updated_items: + logger.info("Add checking '{}'".format(new_item)) + updated_items.add(new_item) else: - logger.warning("Unsupported sanity checking: '%s'" % new_item) + logger.warning('Check item "{}" not in supported check items: {}'.format(new_item, supported_items)) return updated_items +def print_logs(duthosts): + for dut in duthosts: + logger.info("Run commands to print logs, logs to be collected on {}:\n{}"\ + .format(dut.hostname, json.dumps(constants.PRINT_LOGS, indent=4))) + for cmd in constants.PRINT_LOGS.values(): + res = dut.shell(cmd, module_ignore_errors=True) + logger.info("cmd='%s', output:\n%s" % (cmd, json.dumps(res["stdout_lines"], indent=4))) + + +def do_checks(request, check_items): + check_results = [] + for item in check_items: + check_fixture = request.getfixturevalue(_item2fixture(item)) + results = check_fixture() + if results and isinstance(results, list): + check_results.extend(results) + elif results: + check_results.append(results) + return check_results + + @pytest.fixture(scope="module", autouse=True) def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo): - logger.info("Start pre-test sanity check") + logger.info("Prepare pre-test sanity check") skip_sanity = False allow_recover = False recover_method = "adaptive" - check_items = set(copy.deepcopy(constants.DEFAULT_CHECK_ITEMS)) # Default check items + check_items = set(copy.deepcopy(SUPPORTED_CHECKS)) # Default check items post_check = False customized_sanity_check = None @@ -59,7 +93,8 @@ def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo): break if customized_sanity_check: - 
logger.info("Process marker %s in script. m.args=%s, m.kwargs=%s" % (m.name, str(m.args), str(m.kwargs))) + logger.info("Process marker {} in script. m.args={}, m.kwargs={}" + .format(customized_sanity_check.name, customized_sanity_check.args, customized_sanity_check.kwargs)) skip_sanity = customized_sanity_check.kwargs.get("skip_sanity", False) allow_recover = customized_sanity_check.kwargs.get("allow_recover", False) recover_method = customized_sanity_check.kwargs.get("recover_method", "adaptive") @@ -70,17 +105,23 @@ def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo): check_items = _update_check_items(check_items, customized_sanity_check.kwargs.get("check_items", []), - constants.SUPPORTED_CHECK_ITEMS) + SUPPORTED_CHECKS) post_check = customized_sanity_check.kwargs.get("post_check", False) if request.config.option.skip_sanity: skip_sanity = True + if skip_sanity: + logger.info("Skip sanity check according to command line argument or configuration of test script.") + yield + return + if request.config.option.allow_recover: allow_recover = True - items = request.config.getoption("--check_items") - if items: - items_array=str(items).split(',') - check_items = _update_check_items(check_items, items_array, constants.SUPPORTED_CHECK_ITEMS) + + cli_items = request.config.getoption("--check_items") + if cli_items: + cli_items_list=str(cli_items).split(',') + check_items = _update_check_items(check_items, cli_items_list, SUPPORTED_CHECKS) # ignore BGP check for particular topology type if tbinfo['topo']['type'] == 'ptf' and 'bgp' in check_items: @@ -89,70 +130,60 @@ def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo): logger.info("Sanity check settings: skip_sanity=%s, check_items=%s, allow_recover=%s, recover_method=%s, post_check=%s" % \ (skip_sanity, check_items, allow_recover, recover_method, post_check)) - if skip_sanity: - logger.info("Skip sanity check according to command line argument or configuration of test script.") - yield - 
return - if not check_items: logger.info("No sanity check item is specified, no pre-test sanity check") yield logger.info("No sanity check item is specified, no post-test sanity check") return - print_logs(duthosts, constants.PRINT_LOGS) - check_results = do_checks(duthosts, check_items) - logger.info("!!!!!!!!!!!!!!!! Pre-test sanity check results: !!!!!!!!!!!!!!!!\n%s" % \ - json.dumps(check_results, indent=4)) - - pre_sanity_failed = False - for a_dutname, a_dut_results in check_results.items(): - if any([result["failed"] for result in a_dut_results]): - pre_sanity_failed = True - if not allow_recover: - failed_items = json.dumps([result for result in a_dut_results if result["failed"]], indent=4) - logger.error("On {}, failed pre-sanity check items with allow_recover=False:\n{}".format(a_dutname, failed_items)) - else: - logger.info("Pre-test sanity check failed on %s, try to recover, recover_method=%s" % (a_dutname, recover_method)) - recover(duthosts[a_dutname], localhost, fanouthosts, a_dut_results, recover_method) - - pt_assert(allow_recover or not pre_sanity_failed, "Pre-test sanity check failed on DUTs, allow_recover=False:{}".format(check_results)) - - if allow_recover and pre_sanity_failed: - logger.info("Run sanity check again after recovery") - new_check_results = do_checks(duthosts, check_items) - logger.info("!!!!!!!!!!!!!!!! 
Pre-test sanity check after recovery results: !!!!!!!!!!!!!!!!\n%s" % \ - json.dumps(new_check_results, indent=4)) - pre_sanity_failed_after_recover = False - for a_dutname, a_dut_new_results in new_check_results.items(): - if any([result["failed"] for result in a_dut_new_results]): - pre_sanity_failed_after_recover = True - failed_items = json.dumps([result for result in a_dut_new_results if result["failed"]], indent=4) - logger.error("On {}, failed check items after recover:\n{}".format(a_dutname, failed_items)) - - pt_assert(not pre_sanity_failed_after_recover, "Pre-test sanity check failed on DUTs after recover:\n{}".format(new_check_results)) + # Dynamically attach selected check fixtures to node + for item in check_items: + request.fixturenames.append(_item2fixture(item)) + + print_logs(duthosts) + + logger.info("Start pre-test sanity checks") + check_results = do_checks(request, check_items) + logger.debug("Pre-test sanity check results:\n%s" % json.dumps(check_results, indent=4)) + + failed_results = [result for result in check_results if result['failed']] + if failed_results: + if not allow_recover: + pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"\ + .format(json.dumps(failed_results, indent=4))) + else: + dut_failed_results = defaultdict(list) + for failed_result in failed_results: + if 'host' in failed_result: + dut_failed_results[failed_result['host']].append(failed_result) + for dut_name, dut_results in dut_failed_results.items(): + recover(duthosts[dut_name], localhost, fanouthosts, dut_results, recover_method) + + logger.info("Run sanity check again after recovery") + new_check_results = do_checks(request, check_items) + logger.debug("Pre-test sanity check after recovery results:\n%s" % json.dumps(new_check_results, indent=4)) + + new_failed_results = [result for result in new_check_results if result['failed']] + if new_failed_results: + pt_assert(False, "!!!!!!!!!!!!!!!! 
Pre-test sanity check after recovery failed: !!!!!!!!!!!!!!!!\n{}"\ + .format(json.dumps(new_failed_results, indent=4))) logger.info("Done pre-test sanity check") yield - logger.info("Start post-test sanity check") - if not post_check: logger.info("No post-test check is required. Done post-test sanity check") return - post_check_results = do_checks(duthosts, check_items) - logger.info("!!!!!!!!!!!!!!!! Post-test sanity check results: !!!!!!!!!!!!!!!!\n%s" % \ - json.dumps(post_check_results, indent=4)) - post_sanity_failed = False - for a_dutname, a_dut_post_results in post_check_results.items(): - if any([result["failed"] for result in a_dut_post_results]): - post_sanity_failed = True - failed_items = json.dumps([result for result in a_dut_post_results if result["failed"]], indent=4) - logger.error("On {}, failed check items after recover:\n{}".format(a_dutname, failed_items)) - - pt_assert(not post_sanity_failed, "Post-test sanity check failed on DUTs after recover:\n{}".format(post_check_results)) + logger.info("Start post-test sanity check") + post_check_results = do_checks(request, check_items) + logger.debug("Post-test sanity check results:\n%s" % json.dumps(post_check_results, indent=4)) + + post_failed_results = [result for result in post_check_results if result['failed']] + if post_failed_results: + pt_assert(False, "!!!!!!!!!!!!!!!! 
Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"\ + .format(json.dumps(post_failed_results, indent=4))) logger.info("Done post-test sanity check") return diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py index 9845fd72718..5e4152f39ee 100644 --- a/tests/common/plugins/sanity_check/checks.py +++ b/tests/common/plugins/sanity_check/checks.py @@ -3,6 +3,8 @@ import logging import time +import pytest + from tests.common.utilities import wait, wait_until logger = logging.getLogger(__name__) @@ -10,37 +12,52 @@ MONIT_STABILIZE_MAX_TIME = 420 OMEM_THRESHOLD_BYTES=10485760 # 10MB -def check_services(dut): - logger.info("Checking services status on %s..." % dut.hostname) - - networking_uptime = dut.get_networking_uptime().seconds - timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0) - interval = 20 - logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ - (networking_uptime, timeout, interval)) - - check_result = {"failed": True, "check_item": "services"} - if timeout == 0: # Check services status, do not retry. - services_status = dut.critical_services_status() - check_result["failed"] = False if all(services_status.values()) else True - check_result["services_status"] = services_status - else: # Retry checking service status - start = time.time() - elapsed = 0 - while elapsed < timeout: - services_status = dut.critical_services_status() - check_result["failed"] = False if all(services_status.values()) else True - check_result["services_status"] = services_status - - if check_result["failed"]: - wait(interval, msg="Not all services are started, wait %d seconds to retry. 
Remaining time: %d %s" % \ - (interval, int(timeout - elapsed), str(check_result["services_status"]))) - elapsed = time.time() - start - else: - break +__all__ = [ + 'check_services', + 'check_interfaces', + 'check_bgp', + 'check_dbmemory', + 'check_monit', + 'check_processes'] + + +@pytest.fixture(scope="module") +def check_services(duthosts): + def _check(): + check_results = [] + for dut in duthosts: + logger.info("Checking services status on %s..." % dut.hostname) + + networking_uptime = dut.get_networking_uptime().seconds + timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0) + interval = 20 + logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ + (networking_uptime, timeout, interval)) + + check_result = {"failed": True, "check_item": "services", "host": dut.hostname} + if timeout == 0: # Check services status, do not retry. + services_status = dut.critical_services_status() + check_result["failed"] = False if all(services_status.values()) else True + check_result["services_status"] = services_status + else: # Retry checking service status + start = time.time() + elapsed = 0 + while elapsed < timeout: + services_status = dut.critical_services_status() + check_result["failed"] = False if all(services_status.values()) else True + check_result["services_status"] = services_status + + if check_result["failed"]: + wait(interval, msg="Not all services are started, wait %d seconds to retry. 
Remaining time: %d %s" % \ + (interval, int(timeout - elapsed), str(check_result["services_status"]))) + elapsed = time.time() - start + else: + break - logger.info("Done checking services status.") - return check_result + logger.info("Done checking services status on %s" % dut.hostname) + check_results.append(check_result) + return check_results + return _check def _find_down_phy_ports(dut, phy_interfaces): @@ -85,107 +102,119 @@ def _find_down_ports(dut, phy_interfaces, ip_interfaces): return down_ports -def check_interfaces(dut): - logger.info("Checking interfaces status on %s..." % dut.hostname) - - networking_uptime = dut.get_networking_uptime().seconds - timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0) - interval = 20 - logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ - (networking_uptime, timeout, interval)) - - down_ports = [] - check_result = {"failed": True, "check_item": "interfaces"} - for asic in dut.asics: - ip_interfaces = [] - cfg_facts = asic.config_facts(host=dut.hostname, - source="persistent")['ansible_facts'] - phy_interfaces = [k for k, v in cfg_facts["PORT"].items() if "admin_status" in v and v["admin_status"] == "up"] - if "PORTCHANNEL_INTERFACE" in cfg_facts: - ip_interfaces = cfg_facts["PORTCHANNEL_INTERFACE"].keys() - if "VLAN_INTERFACE" in cfg_facts: - ip_interfaces += cfg_facts["VLAN_INTERFACE"].keys() - - logger.info(json.dumps(phy_interfaces, indent=4)) - logger.info(json.dumps(ip_interfaces, indent=4)) - - if timeout == 0: # Check interfaces status, do not retry. - down_ports += _find_down_ports(asic, phy_interfaces, ip_interfaces) +@pytest.fixture(scope="module") +def check_interfaces(duthosts): + def _check(): + check_results = [] + for dut in duthosts: + logger.info("Checking interfaces status on %s..." 
% dut.hostname) + + networking_uptime = dut.get_networking_uptime().seconds + timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0) + interval = 20 + logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ + (networking_uptime, timeout, interval)) + + down_ports = [] + check_result = {"failed": True, "check_item": "interfaces", "host": dut.hostname} + for asic in dut.asics: + ip_interfaces = [] + cfg_facts = asic.config_facts(host=dut.hostname, + source="persistent")['ansible_facts'] + phy_interfaces = [k for k, v in cfg_facts["PORT"].items() if "admin_status" in v and v["admin_status"] == "up"] + if "PORTCHANNEL_INTERFACE" in cfg_facts: + ip_interfaces = cfg_facts["PORTCHANNEL_INTERFACE"].keys() + if "VLAN_INTERFACE" in cfg_facts: + ip_interfaces += cfg_facts["VLAN_INTERFACE"].keys() + + logger.info(json.dumps(phy_interfaces, indent=4)) + logger.info(json.dumps(ip_interfaces, indent=4)) + + if timeout == 0: # Check interfaces status, do not retry. + down_ports += _find_down_ports(asic, phy_interfaces, ip_interfaces) + check_result["failed"] = True if len(down_ports) > 0 else False + check_result["down_ports"] = down_ports + else: # Retry checking interface status + start = time.time() + elapsed = 0 + while elapsed < timeout: + down_ports = _find_down_ports(asic, phy_interfaces, ip_interfaces) + check_result["failed"] = True if len(down_ports) > 0 else False + check_result["down_ports"] = down_ports + + if check_result["failed"]: + wait(interval, msg="Found down ports, wait %d seconds to retry. 
Remaining time: %d, down_ports=%s" % \ + (interval, int(timeout - elapsed), str(check_result["down_ports"]))) + elapsed = time.time() - start + else: + break + + logger.info("Done checking interfaces status on %s" % dut.hostname) check_result["failed"] = True if len(down_ports) > 0 else False check_result["down_ports"] = down_ports - else: # Retry checking interface status - start = time.time() - elapsed = 0 - while elapsed < timeout: - down_ports = _find_down_ports(asic, phy_interfaces, ip_interfaces) - check_result["failed"] = True if len(down_ports) > 0 else False - check_result["down_ports"] = down_ports - - if check_result["failed"]: - wait(interval, msg="Found down ports, wait %d seconds to retry. Remaining time: %d, down_ports=%s" % \ - (interval, int(timeout - elapsed), str(check_result["down_ports"]))) - elapsed = time.time() - start - else: - break - - logger.info("Done checking interfaces status.") - check_result["failed"] = True if len(down_ports) > 0 else False - check_result["down_ports"] = down_ports - return check_result - - -def check_bgp_status(dut): - - def _check_bgp_status_helper(): - asic_check_results = [] - bgp_facts = dut.bgp_facts(asic_index='all') - for asic_index, a_asic_facts in enumerate(bgp_facts): - a_asic_result = False - a_asic_neighbors = a_asic_facts['ansible_facts']['bgp_neighbors'] - if a_asic_neighbors: - down_neighbors = [k for k, v in a_asic_neighbors.items() - if v['state'] != 'established'] - if down_neighbors: - if dut.facts['num_asic'] == 1: - check_result['bgp'] = {'down_neighbors' : down_neighbors } - else: - check_result['bgp' + str(asic_index)] = {'down_neighbors' : down_neighbors } - a_asic_result = True - else: + check_results.append(check_result) + return check_results + return _check + + +@pytest.fixture(scope="module") +def check_bgp(duthosts): + def _check(): + check_results = [] + for dut in duthosts: + def _check_bgp_status_helper(): + asic_check_results = [] + bgp_facts = dut.bgp_facts(asic_index='all') + 
for asic_index, a_asic_facts in enumerate(bgp_facts): a_asic_result = False - if dut.facts['num_asic'] == 1: - if 'bgp' in check_result: - check_result['bgp'].pop('down_neighbors', None) + a_asic_neighbors = a_asic_facts['ansible_facts']['bgp_neighbors'] + if a_asic_neighbors: + down_neighbors = [k for k, v in a_asic_neighbors.items() + if v['state'] != 'established'] + if down_neighbors: + if dut.facts['num_asic'] == 1: + check_result['bgp'] = {'down_neighbors' : down_neighbors } + else: + check_result['bgp' + str(asic_index)] = {'down_neighbors' : down_neighbors } + a_asic_result = True + else: + a_asic_result = False + if dut.facts['num_asic'] == 1: + if 'bgp' in check_result: + check_result['bgp'].pop('down_neighbors', None) + else: + if 'bgp' + str(asic_index) in check_result: + check_result['bgp' + str(asic_index)].pop('down_neighbors', None) else: - if 'bgp' + str(asic_index) in check_result: - check_result['bgp' + str(asic_index)].pop('down_neighbors', None) + a_asic_result = True + + asic_check_results.append(a_asic_result) + + if any(asic_check_results): + check_result['failed'] = True + return not check_result['failed'] + + logger.info("Checking bgp status on host %s ..." 
% dut.hostname) + check_result = {"failed": False, "check_item": "bgp", "host": dut.hostname} + + networking_uptime = dut.get_networking_uptime().seconds + timeout = max(SYSTEM_STABILIZE_MAX_TIME - networking_uptime, 1) + interval = 20 + wait_until(timeout, interval, _check_bgp_status_helper) + if (check_result['failed']): + for a_result in check_result.keys(): + if a_result != 'failed': + # Dealing with asic result + if 'down_neighbors' in check_result[a_result]: + logger.info('BGP neighbors down: %s on bgp instance %s on dut %s' % (check_result[a_result]['down_neighbors'], a_result, dut.hostname)) else: - a_asic_result = True - - asic_check_results.append(a_asic_result) - - if any(asic_check_results): - check_result['failed'] = True - return not check_result['failed'] - - logger.info("Checking bgp status on host %s ..." % dut.hostname) - check_result = {"failed": False, "check_item": "bgp"} - - networking_uptime = dut.get_networking_uptime().seconds - timeout = max(SYSTEM_STABILIZE_MAX_TIME - networking_uptime, 1) - interval = 20 - wait_until(timeout, interval, _check_bgp_status_helper) - if (check_result['failed']): - for a_result in check_result.keys(): - if a_result != 'failed': - # Dealing with asic result - if 'down_neighbors' in check_result[a_result]: - logger.info('BGP neighbors down: %s on bgp instance %s on dut %s' % (check_result[a_result]['down_neighbors'], a_result, dut.hostname)) - else: - logger.info('No BGP neighbors are down on %s' % dut.hostname) - - logger.info("Done checking bgp status on %s" % dut.hostname) - return check_result + logger.info('No BGP neighbors are down on %s' % dut.hostname) + + logger.info("Done checking bgp status on %s" % dut.hostname) + check_results.append(check_result) + + return check_results + return _check def _is_db_omem_over_threshold(command_output): @@ -199,30 +228,37 @@ def _is_db_omem_over_threshold(command_output): if m: omem = int(m.group(1)) total_omem += omem - logger.info(json.dumps(command_output, 
indent=4)) + logger.debug(json.dumps(command_output, indent=4)) if total_omem > OMEM_THRESHOLD_BYTES: result = True return result, total_omem -def check_dbmemory(dut): - logger.info("Checking database memory on %s..." % dut.hostname) - redis_cmd = "client list" - check_result = {"failed": False, "check_item": "dbmemory"} - # check the db memory on the redis instance running on each instance - for asic in dut.asics: - res = asic.run_redis_cli_cmd(redis_cmd)['stdout_lines'] - result, total_omem = _is_db_omem_over_threshold(res) - if result: - check_result["failed"] = True - check_result["total_omem"] = total_omem - logging.info("{} db memory over the threshold ".format(str(asic.namespace or ''))) - break - logger.info("Done checking database memory") - return check_result +@pytest.fixture(scope="module") +def check_dbmemory(duthosts): + def _check(): + check_results = [] + for dut in duthosts: + logger.info("Checking database memory on %s..." % dut.hostname) + redis_cmd = "client list" + check_result = {"failed": False, "check_item": "dbmemory", "host": dut.hostname} + # check the db memory on the redis instance running on each instance + for asic in dut.asics: + res = asic.run_redis_cli_cmd(redis_cmd)['stdout_lines'] + result, total_omem = _is_db_omem_over_threshold(res) + if result: + check_result["failed"] = True + check_result["total_omem"] = total_omem + logging.info("{} db memory over the threshold ".format(str(asic.namespace or ''))) + break + logger.info("Done checking database memory on %s" % dut.hostname) + check_results.append(check_result) + return check_results + return _check -def check_monit_services_status(check_result, monit_services_status): + +def _check_monit_services_status(check_result, monit_services_status): """ @summary: Check whether each type of service which was monitored by Monit was in correct status or not. 
If a service was in "Not monitored" status, sanity check will skip it since this service @@ -241,129 +277,114 @@ def check_monit_services_status(check_result, monit_services_status): return check_result -def check_monit(dut): + +@pytest.fixture(scope="module") +def check_monit(duthosts): """ - @summary: Check whether the Monit is running and whether the services which were monitored by Monit are + @summary: Check whether the Monit is running and whether the services which were monitored by Monit are in the correct status or not. @return: A dictionary contains the testing result (failed or not failed) and the status of each service. """ - logger.info("Checking status of each Monit service...") - networking_uptime = dut.get_networking_uptime().seconds - timeout = max((MONIT_STABILIZE_MAX_TIME - networking_uptime), 0) - interval = 20 - logger.info("networking_uptime = {} seconds, timeout = {} seconds, interval = {} seconds" \ - .format(networking_uptime, timeout, interval)) - - check_result = {"failed": False, "check_item": "monit"} - - if timeout == 0: - monit_services_status = dut.get_monit_services_status() - if not monit_services_status: - logger.info("Monit was not running.") - check_result["failed"] = True - check_result["failed_reason"] = "Monit was not running" - logger.info("Checking status of each Monit service was done!") - return check_result - - check_result = check_monit_services_status(check_result, monit_services_status) - else: - start = time.time() - elapsed = 0 - is_monit_running = False - while elapsed < timeout: - check_result["failed"] = False - monit_services_status = dut.get_monit_services_status() - if not monit_services_status: - wait(interval, msg="Monit was not started and wait {} seconds to retry. Remaining time: {}." 
\ - .format(interval, timeout - elapsed)) - elapsed = time.time() - start - continue - - is_monit_running = True - check_result = check_monit_services_status(check_result, monit_services_status) - if check_result["failed"]: - wait(interval, msg="Services were not monitored and wait {} seconds to retry. Remaining time: {}. Services status: {}" \ - .format(interval, timeout - elapsed, str(check_result["services_status"]))) - elapsed = time.time() - start + def _check(): + check_results = [] + for dut in duthosts: + logger.info("Checking status of each Monit service...") + networking_uptime = dut.get_networking_uptime().seconds + timeout = max((MONIT_STABILIZE_MAX_TIME - networking_uptime), 0) + interval = 20 + logger.info("networking_uptime = {} seconds, timeout = {} seconds, interval = {} seconds" \ + .format(networking_uptime, timeout, interval)) + + check_result = {"failed": False, "check_item": "monit", "host": dut.hostname} + + if timeout == 0: + monit_services_status = dut.get_monit_services_status() + if not monit_services_status: + logger.info("Monit was not running.") + check_result["failed"] = True + check_result["failed_reason"] = "Monit was not running" + logger.info("Checking status of each Monit service was done!") + return check_result + + check_result = _check_monit_services_status(check_result, monit_services_status) else: - break - - if not is_monit_running: - logger.info("Monit was not running.") - check_result["failed"] = True - check_result["failed_reason"] = "Monit was not running" - - logger.info("Checking status of each Monit service was done!") - return check_result - -def check_processes(dut): - logger.info("Checking process status on %s..." 
% dut.hostname) - - networking_uptime = dut.get_networking_uptime().seconds - timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0) - interval = 20 - logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ - (networking_uptime, timeout, interval)) - - check_result = {"failed": False, "check_item": "processes"} - if timeout == 0: # Check processes status, do not retry. - processes_status = dut.all_critical_process_status() - check_result["processes_status"] = processes_status - check_result["services_status"] = {} - for k, v in processes_status.items(): - if v['status'] == False or len(v['exited_critical_process']) > 0: - check_result['failed'] = True - check_result["services_status"].update({k: v['status']}) - else: # Retry checking processes status - start = time.time() - elapsed = 0 - while elapsed < timeout: - check_result["failed"] = False - processes_status = dut.all_critical_process_status() - check_result["processes_status"] = processes_status - check_result["services_status"] = {} - for k, v in processes_status.items(): - if v['status'] == False or len(v['exited_critical_process']) > 0: - check_result['failed'] = True - check_result["services_status"].update({k: v['status']}) - - if check_result["failed"]: - wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \ - (interval, int(timeout - elapsed), str(check_result["processes_status"]))) - elapsed = time.time() - start - else: - break + start = time.time() + elapsed = 0 + is_monit_running = False + while elapsed < timeout: + check_result["failed"] = False + monit_services_status = dut.get_monit_services_status() + if not monit_services_status: + wait(interval, msg="Monit was not started and wait {} seconds to retry. Remaining time: {}." 
\ + .format(interval, timeout - elapsed)) + elapsed = time.time() - start + continue + + is_monit_running = True + check_result = _check_monit_services_status(check_result, monit_services_status) + if check_result["failed"]: + wait(interval, msg="Services were not monitored and wait {} seconds to retry. Remaining time: {}. Services status: {}" \ + .format(interval, timeout - elapsed, str(check_result["services_status"]))) + elapsed = time.time() - start + else: + break + + if not is_monit_running: + logger.info("Monit was not running.") + check_result["failed"] = True + check_result["failed_reason"] = "Monit was not running" + + logger.info("Checking status of each Monit service was done on %s" % dut.hostname) + check_results.append(check_result) + + return check_results + return _check + + +@pytest.fixture(scope="module") +def check_processes(duthosts): + def _check(): + check_results = [] + for dut in duthosts: + logger.info("Checking process status on %s..." % dut.hostname) + + networking_uptime = dut.get_networking_uptime().seconds + timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0) + interval = 20 + logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ + (networking_uptime, timeout, interval)) + + check_result = {"failed": False, "check_item": "processes", "host": dut.hostname} + if timeout == 0: # Check processes status, do not retry. 
+ processes_status = dut.all_critical_process_status() + check_result["processes_status"] = processes_status + check_result["services_status"] = {} + for k, v in processes_status.items(): + if v['status'] == False or len(v['exited_critical_process']) > 0: + check_result['failed'] = True + check_result["services_status"].update({k: v['status']}) + else: # Retry checking processes status + start = time.time() + elapsed = 0 + while elapsed < timeout: + check_result["failed"] = False + processes_status = dut.all_critical_process_status() + check_result["processes_status"] = processes_status + check_result["services_status"] = {} + for k, v in processes_status.items(): + if v['status'] == False or len(v['exited_critical_process']) > 0: + check_result['failed'] = True + check_result["services_status"].update({k: v['status']}) + + if check_result["failed"]: + wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \ + (interval, int(timeout - elapsed), str(check_result["processes_status"]))) + elapsed = time.time() - start + else: + break - logger.info("Done checking processes status.") - return check_result + logger.info("Done checking processes status on %s" % dut.hostname) + check_results.append(check_result) -def do_checks(duthosts, check_items): - results = {} - for dut in duthosts: - results[dut.hostname] = [] - for item in check_items: - if item == "services": - results[dut.hostname].append(check_services(dut)) - elif item == "interfaces": - if dut in duthosts.frontend_nodes: - results[dut.hostname].append(check_interfaces(dut)) - elif item == "dbmemory": - results [dut.hostname].append(check_dbmemory(dut)) - elif item == "processes": - results[dut.hostname].append(check_processes(dut)) - elif item == "bgp": - if dut in duthosts.frontend_nodes: - results[dut.hostname].append(check_bgp_status(dut)) - elif item == "monit": - results[dut.hostname].append(check_monit(dut)) - - return results - - -def print_logs(duthosts, 
print_logs): - for per_host in duthosts: - logger.info("Run commands to print logs, logs to be collected on %s:\n %s" % (per_host.hostname, json.dumps(print_logs, indent=4))) - for item in print_logs: - cmd = print_logs[item] - res = per_host.shell(cmd, module_ignore_errors=True) - logger.info("cmd='%s', output:\n%s" % (cmd, json.dumps(res["stdout_lines"], indent=4))) + return check_results + return _check diff --git a/tests/common/plugins/sanity_check/constants.py b/tests/common/plugins/sanity_check/constants.py index 1ec8c45d25e..4daa1c74825 100644 --- a/tests/common/plugins/sanity_check/constants.py +++ b/tests/common/plugins/sanity_check/constants.py @@ -20,6 +20,3 @@ "fast_reboot": {"cmd": "fast_reboot", "reboot": True, "adaptive": False, 'recover_wait': 120}, "adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30}, } # All supported recover methods - -SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp", "monit"] # Supported checks -DEFAULT_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp", "monit"] # Default checks