sonic-net · wangxin · Jan 29, 2021 · Jan 28, 2021 · Jan 29, 2021
diff --git a/tests/common/plugins/sanity_check/README.md b/tests/common/plugins/sanity_check/README.md
@@ -52,13 +52,13 @@ We can use keyword argument `check_items` to fine tune the items to be checked i
 * interfaces: Check the status of network interfaces.
 Please refer to sonic-mgmt/tests/common/plugins/sanity_check/constants::SUPPORTED_CHECK_ITEMS for the latest supported check items.
 
-Value for `check_items` should be a tuple or list of strings. Each item in the tuple or list should be a string. The string can be name of the supported check items with optional prefix `+` or `-`. Unsupported check items will be ignored.
+Value for `check_items` should be a tuple or list of strings. Each string is the name of a supported check item, optionally prefixed with `+`, `-` or `_`. Unsupported check items will be ignored.
 
-If a supported check item is prefixed with `-`, then this item will not be checked in sanity. For items with prefix `+` or without prefixes, the item should be included in the list of items to be checked in sanity.
+If a supported check item is prefixed with `-` or `_`, then this item will not be checked in sanity. For items with prefix `+` or without prefixes, the item should be included in the list of items to be checked in sanity.
 
 With this design, we can extend the sanity check items in the future. By default, only a very basic set of sanity check is performed. For some test scripts that do not need some default sanity check items or need some extra sanity check items, we can use this syntax to tailor the check items that fit best for the current test script.
 
-User can change check item list by passing parameter from command line --check_items="add remove string". Exmaple: --check_items="-services,+bgp" means do not check services, but add bgp to the check list. This parameter is not an absolute list, it is addition or subtraction from the existing list.
+User can change the check item list from the command line by passing --check_items="add remove string". Example: --check_items="_services,+bgp" means do not check services, but add bgp to the check list. This parameter is not an absolute list; it adds items to or removes items from the existing list. On the command line, "-" has a special meaning, so use the "_" prefix to skip a check item.
 
 ## Log collecting
 If sanity check is to be performed, the script will also run some commands on the DUT to collect some basic information for debugging. Please refer to sonic-mgmt/tests/common/plugins/sanity_check/constants::PRINT_LOGS for the list of logs that will be collected.
@@ -77,6 +77,15 @@ The sanity check plugin also supports pytest command line option `--allow_recove
 $ pytest -i inventory --host-pattern switch1-t0 --module-path ../ansible/library/ --testbed switch1-t0 --testbed-file testbed.csv --log-cli-level info test_something.py --allow_recover
 ```
 
+## Check item
+The check items are defined in the `checks.py` module. In the original design, each check item was defined as an ordinary function. All the dependent fixtures had to be specified in the argument list of `sanity_check`, and the fixture objects were then passed to the check functions as arguments. However, this design has a limitation: not all the fixtures that sanity check depends on are supported on all topologies. On some topologies, sanity check may fail while getting those fixtures.
+To resolve that issue, we have changed the design. Now the check items must be defined as fixtures. Then the check fixtures can be dynamically attached to test cases during run time. In the sanity check plugin, we can check the current testbed type or other conditions to decide whether or not to load certain check fixtures.
+
+### Check item implementation details
+Each check fixture must use the factory design pattern to return a check function. This lets us delay execution of the various sanity checks until after the sanity check items have been dynamically adjusted.
+
+Each check fixture must be named with the pattern `check_<item name>`. When a new check fixture is defined, its name must be added to the `__all__` list of the `checks.py` module.
+
 ## Why check networking uptime?
 
 The sanity check may be performed right after the DUT is rebooted or config reload is performed. In this case, services and interfaces may not be ready yet and sanity check will fail unnecessarily.

diff --git a/tests/common/plugins/sanity_check/__init__.py b/tests/common/plugins/sanity_check/__init__.py
@@ -1,18 +1,30 @@
 
 import logging
-import random
 import copy
 import json
 
 import pytest
 
-import constants
-from checks import do_checks, print_logs
-from recover import recover
+from inspect import getmembers, isfunction
+from collections import defaultdict
+
+from tests.common.plugins.sanity_check import constants
+from tests.common.plugins.sanity_check import checks
+from tests.common.plugins.sanity_check.checks import *
+from tests.common.plugins.sanity_check.recover import recover
 from tests.common.helpers.assertions import pytest_assert as pt_assert
 
+from tests.common.plugins.sanity_check.checks import check_monit
+
 logger = logging.getLogger(__name__)
 
+SUPPORTED_CHECKS = [member[0].replace('check_', '') for member in getmembers(checks, isfunction)
+                    if member[0].startswith('check_')]  # item names derived from every check_* function found in the checks module
+
+
+def _item2fixture(item):  # map a check item name to the name of its fixture
+    return 'check_' + item  # fixture names follow the check_<item name> pattern
+
 
 def _update_check_items(old_items, new_items, supported_items):
     """
@@ -25,30 +37,52 @@ def _update_check_items(old_items, new_items, supported_items):
     for new_item in new_items:
         if not new_item:
             continue
-        if new_item[0] == "-":      # Remove default check item
+        if new_item[0] in ["_", "-"]:      # Remove default check item
             new_item = new_item[1:]
             if new_item in updated_items:
                 logger.info("Skip checking '%s'" % new_item)
                 updated_items.remove(new_item)
         else:                       # Add a check item
             if new_item[0] == "+":
                 new_item = new_item[1:]
-            if new_item in supported_items:
-                logger.info("Add checking '%s'" % new_item)
-                updated_items.add(new_item)
+            if new_item in supported_items :
+                if new_item not in updated_items:
+                    logger.info("Add checking '{}'".format(new_item))
+                    updated_items.add(new_item)
             else:
-                logger.warning("Unsupported sanity checking: '%s'" % new_item)
+                logger.warning('Check item "{}" no in supported check items: {}'.format(new_item, supported_items))
     return updated_items
 
 
+def print_logs(duthosts):
+    for dut in duthosts:  # run the same set of log commands on every DUT
+        logger.info("Run commands to print logs, logs to be collected on {}:\n{}"\
+            .format(dut.hostname, json.dumps(constants.PRINT_LOGS, indent=4)))
+        for cmd in constants.PRINT_LOGS.values():  # only the command strings are run; the dict keys are not used here
+            res = dut.shell(cmd, module_ignore_errors=True)  # NOTE(review): presumably keeps a failing command from raising -- confirm
+            logger.info("cmd='%s', output:\n%s" % (cmd, json.dumps(res["stdout_lines"], indent=4)))
+
+
+def do_checks(request, check_items):
+    check_results = []  # flat list accumulating the results of every check item
+    for item in check_items:
+        check_fixture = request.getfixturevalue(_item2fixture(item))  # resolve the check_<item> fixture by name at run time
+        results = check_fixture()  # the fixture value is itself a callable; calling it performs the check
+        if results and isinstance(results, list):  # non-empty list: merge its entries
+            check_results.extend(results)
+        elif results:  # any other truthy result is kept as a single entry
+            check_results.append(results)
+    return check_results  # callers in this file read result['failed'] on each entry
+
+
 @pytest.fixture(scope="module", autouse=True)
 def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
-    logger.info("Start pre-test sanity check")
+    logger.info("Prepare pre-test sanity check")
 
     skip_sanity = False
     allow_recover = False
     recover_method = "adaptive"
-    check_items = set(copy.deepcopy(constants.DEFAULT_CHECK_ITEMS))  # Default check items
+    check_items = set(copy.deepcopy(SUPPORTED_CHECKS))  # Default check items
     post_check = False
 
     customized_sanity_check = None
@@ -59,7 +93,8 @@ def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
             break
 
     if customized_sanity_check:
-        logger.info("Process marker %s in script. m.args=%s, m.kwargs=%s" % (m.name, str(m.args), str(m.kwargs)))
+        logger.info("Process marker {} in script. m.args={}, m.kwargs={}"
+            .format(customized_sanity_check.name, customized_sanity_check.args, customized_sanity_check.kwargs))
         skip_sanity = customized_sanity_check.kwargs.get("skip_sanity", False)
         allow_recover = customized_sanity_check.kwargs.get("allow_recover", False)
         recover_method = customized_sanity_check.kwargs.get("recover_method", "adaptive")
@@ -70,17 +105,23 @@ def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
 
         check_items = _update_check_items(check_items,
                                           customized_sanity_check.kwargs.get("check_items", []),
-                                          constants.SUPPORTED_CHECK_ITEMS)
+                                          SUPPORTED_CHECKS)
         post_check = customized_sanity_check.kwargs.get("post_check", False)
 
     if request.config.option.skip_sanity:
         skip_sanity = True
+    if skip_sanity:
+        logger.info("Skip sanity check according to command line argument or configuration of test script.")
+        yield
+        return
+
     if request.config.option.allow_recover:
         allow_recover = True
-    items = request.config.getoption("--check_items")
-    if items:
-        items_array=str(items).split(',')
-        check_items = _update_check_items(check_items, items_array, constants.SUPPORTED_CHECK_ITEMS)
+
+    cli_items = request.config.getoption("--check_items")
+    if cli_items:
+        cli_items_list=str(cli_items).split(',')
+        check_items = _update_check_items(check_items, cli_items_list, SUPPORTED_CHECKS)
 
     # ignore BGP check for particular topology type
     if tbinfo['topo']['type'] == 'ptf' and 'bgp' in check_items:
@@ -89,70 +130,60 @@ def sanity_check(localhost, duthosts, request, fanouthosts, tbinfo):
     logger.info("Sanity check settings: skip_sanity=%s, check_items=%s, allow_recover=%s, recover_method=%s, post_check=%s" % \
         (skip_sanity, check_items, allow_recover, recover_method, post_check))
 
-    if skip_sanity:
-        logger.info("Skip sanity check according to command line argument or configuration of test script.")
-        yield
-        return
-
     if not check_items:
         logger.info("No sanity check item is specified, no pre-test sanity check")
         yield
         logger.info("No sanity check item is specified, no post-test sanity check")
         return
 
-    print_logs(duthosts, constants.PRINT_LOGS)
-    check_results = do_checks(duthosts, check_items)
-    logger.info("!!!!!!!!!!!!!!!! Pre-test sanity check results: !!!!!!!!!!!!!!!!\n%s" % \
-                json.dumps(check_results, indent=4))
-
-    pre_sanity_failed = False
-    for a_dutname, a_dut_results in check_results.items():
-        if any([result["failed"] for result in a_dut_results]):
-            pre_sanity_failed = True
-            if not allow_recover:
-                failed_items = json.dumps([result for result in a_dut_results if result["failed"]], indent=4)
-                logger.error("On {}, failed pre-sanity check items with allow_recover=False:\n{}".format(a_dutname, failed_items))
-            else:
-                logger.info("Pre-test sanity check failed on %s, try to recover, recover_method=%s" % (a_dutname, recover_method))
-                recover(duthosts[a_dutname], localhost, fanouthosts, a_dut_results, recover_method)
-
-    pt_assert(allow_recover or not pre_sanity_failed, "Pre-test sanity check failed on DUTs, allow_recover=False:{}".format(check_results))
-
-    if allow_recover and pre_sanity_failed:
-        logger.info("Run sanity check again after recovery")
-        new_check_results = do_checks(duthosts, check_items)
-        logger.info("!!!!!!!!!!!!!!!! Pre-test sanity check after recovery results: !!!!!!!!!!!!!!!!\n%s" % \
-                    json.dumps(new_check_results, indent=4))
-        pre_sanity_failed_after_recover = False
-        for a_dutname, a_dut_new_results in new_check_results.items():
-            if any([result["failed"] for result in a_dut_new_results]):
-                pre_sanity_failed_after_recover = True
-                failed_items = json.dumps([result for result in a_dut_new_results if result["failed"]], indent=4)
-                logger.error("On {}, failed check items after recover:\n{}".format(a_dutname, failed_items))
-
-        pt_assert(not pre_sanity_failed_after_recover, "Pre-test sanity check failed on DUTs after recover:\n{}".format(new_check_results))
+    # Dynamically attach selected check fixtures to node
+    for item in check_items:
+        request.fixturenames.append(_item2fixture(item))
+
+    print_logs(duthosts)
+
+    logger.info("Start pre-test sanity checks")
+    check_results = do_checks(request, check_items)
+    logger.debug("Pre-test sanity check results:\n%s" % json.dumps(check_results, indent=4))
+
+    failed_results = [result for result in check_results if result['failed']]
+    if failed_results:
+        if not allow_recover:
+            pt_assert(False, "!!!!!!!!!!!!!!!!Pre-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"\
+                .format(json.dumps(failed_results, indent=4)))
+        else:
+            dut_failed_results = defaultdict(list)
+            for failed_result in failed_results:
+                if 'host' in failed_result:
+                    dut_failed_results[failed_result['host']].append(failed_result)
+            for dut_name, dut_results in dut_failed_results.items():
+                recover(duthosts[dut_name], localhost, fanouthosts, dut_results, recover_method)
+
+            logger.info("Run sanity check again after recovery")
+            new_check_results = do_checks(request, check_items)
+            logger.debug("Pre-test sanity check after recovery results:\n%s" % json.dumps(new_check_results, indent=4))
+
+            new_failed_results = [result for result in new_check_results if result['failed']]
+            if new_failed_results:
+                pt_assert(False, "!!!!!!!!!!!!!!!! Pre-test sanity check after recovery failed: !!!!!!!!!!!!!!!!\n{}"\
+                    .format(json.dumps(new_failed_results, indent=4)))
 
     logger.info("Done pre-test sanity check")
 
     yield
 
-    logger.info("Start post-test sanity check")
-
     if not post_check:
         logger.info("No post-test check is required. Done post-test sanity check")
         return
 
-    post_check_results = do_checks(duthosts, check_items)
-    logger.info("!!!!!!!!!!!!!!!! Post-test sanity check results: !!!!!!!!!!!!!!!!\n%s" % \
-                json.dumps(post_check_results, indent=4))
-    post_sanity_failed = False
-    for a_dutname, a_dut_post_results in post_check_results.items():
-        if any([result["failed"] for result in a_dut_post_results]):
-            post_sanity_failed = True
-            failed_items = json.dumps([result for result in a_dut_post_results if result["failed"]], indent=4)
-            logger.error("On {}, failed check items after recover:\n{}".format(a_dutname, failed_items))
-
-    pt_assert(not post_sanity_failed, "Post-test sanity check failed on DUTs after recover:\n{}".format(post_check_results))
+    logger.info("Start post-test sanity check")
+    post_check_results = do_checks(request, check_items)
+    logger.debug("Post-test sanity check results:\n%s" % json.dumps(post_check_results, indent=4))
+
+    post_failed_results = [result for result in post_check_results if result['failed']]
+    if post_failed_results:
+        pt_assert(False, "!!!!!!!!!!!!!!!! Post-test sanity check failed: !!!!!!!!!!!!!!!!\n{}"\
+            .format(json.dumps(post_failed_results, indent=4)))
 
     logger.info("Done post-test sanity check")
     return