From 7c9dd996bf595d6efb53aa412d23469e9db94260 Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Fri, 24 Jul 2020 21:53:20 +0000 Subject: [PATCH 1/2] [sanity check] add BGP to the sanity check list - Generate warning log when sanity check recovery action is taken. - Check BGP neighbor health by default. - Separate the default check list from supported check list. - Allow disabling check with command line parameters. - Increase load_config recovery wait time to 120 seconds. - Address variable used before initialize issue in bgp_facts.py. Signed-off-by: Ying Xie --- ansible/library/bgp_facts.py | 4 +++- tests/common/devices.py | 12 ++++++++++ tests/common/plugins/sanity_check/README.md | 2 ++ tests/common/plugins/sanity_check/__init__.py | 6 ++++- tests/common/plugins/sanity_check/checks.py | 23 +++++++++++++++++++ .../common/plugins/sanity_check/constants.py | 5 ++-- tests/common/plugins/sanity_check/recover.py | 9 ++++---- tests/conftest.py | 2 ++ 8 files changed, 55 insertions(+), 8 deletions(-) diff --git a/ansible/library/bgp_facts.py b/ansible/library/bgp_facts.py index 471d8cfa2c4..5d757ac63e4 100644 --- a/ansible/library/bgp_facts.py +++ b/ansible/library/bgp_facts.py @@ -142,6 +142,7 @@ def parse_neighbors(self): lines = n.splitlines() neighbor['admin'] = 'up' neighbor['accepted prefixes'] = 0 + neighbor_ip = None for line in lines: if regex_ipv4.match(line): @@ -186,7 +187,8 @@ def parse_neighbors(self): if message_stats: neighbor['message statistics'] = message_stats - neighbors[neighbor_ip] = neighbor + if neighbor_ip: + neighbors[neighbor_ip] = neighbor except Exception as e: self.module.fail_json(msg=str(e)) diff --git a/tests/common/devices.py b/tests/common/devices.py index 1324c56cceb..d80e64f8c92 100644 --- a/tests/common/devices.py +++ b/tests/common/devices.py @@ -630,6 +630,18 @@ def get_bgp_neighbor_info(self, neighbor_ip): return nbinfo[str(neighbor_ip)] + def get_bgp_neighbors(self): + """ + Get a diction of BGP neighbor states + + Args: None + + Returns: dictionary { (neighbor_ip : info_dict)* } + + """ + bgp_facts = self.bgp_facts()['ansible_facts'] + return bgp_facts['bgp_neighbors'] + def check_bgp_session_state(self, neigh_ips, state="established"): """ @summary: check if current bgp session equals to the target state diff --git a/tests/common/plugins/sanity_check/README.md b/tests/common/plugins/sanity_check/README.md index 22473d5c231..06e9c85688e 100644 --- a/tests/common/plugins/sanity_check/README.md +++ b/tests/common/plugins/sanity_check/README.md @@ -58,6 +58,8 @@ If a supported check item is prefixed with `-`, then this item will not be check With this design, we can extend the sanity check items in the future. By default, only a very basic set of sanity check is performed. For some test scripts that do not need some default sanity check items or need some extra sanity check items, we can use this syntax to tailor the check items that fit best for the current test script. +User can change check item list by passing parameter from command line --check_items="add remove string". Exmaple: --check_items="-services,+bgp" means do not check services, but add bgp to the check list. This parameter is not an absolute list, it is addition or subtraction from the existing list. + ## Log collecting If sanity check is to be performed, the script will also run some commands on the DUT to collect some basic information for debugging. Please refer to sonic-mgmt/tests/common/plugins/sanity_check/constants::PRINT_LOGS for the list of logs that will be collected. diff --git a/tests/common/plugins/sanity_check/__init__.py b/tests/common/plugins/sanity_check/__init__.py index 715fc68f70d..f94b8381f79 100644 --- a/tests/common/plugins/sanity_check/__init__.py +++ b/tests/common/plugins/sanity_check/__init__.py @@ -47,7 +47,7 @@ def sanity_check(localhost, duthost, request, fanouthosts): skip_sanity = False allow_recover = False recover_method = "adaptive" - check_items = set(copy.deepcopy(constants.SUPPORTED_CHECK_ITEMS)) # Default check items + check_items = set(copy.deepcopy(constants.DEFAULT_CHECK_ITEMS)) # Default check items post_check = False customized_sanity_check = None @@ -76,6 +76,10 @@ def sanity_check(localhost, duthost, request, fanouthosts): skip_sanity = True if request.config.option.allow_recover: allow_recover = True + items = request.config.getoption("--check_items") + if items: + items_array=str(items).split(',') + check_items = _update_check_items(check_items, items_array, constants.SUPPORTED_CHECK_ITEMS) logger.info("Sanity check settings: skip_sanity=%s, check_items=%s, allow_recover=%s, recover_method=%s, post_check=%s" % \ (skip_sanity, check_items, allow_recover, recover_method, post_check)) diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py index d21655e613b..af80e21581e 100644 --- a/tests/common/plugins/sanity_check/checks.py +++ b/tests/common/plugins/sanity_check/checks.py @@ -96,6 +96,27 @@ def check_interfaces(dut): logger.info("Done checking interfaces status.") return check_result +def check_bgp_status(dut): + logger.info("Checking bgp status...") + check_result = {"failed": False, "check_item": "bgp"} + + neis = dut.get_bgp_neighbors() + if not neis: + logger.info("BGP neighbors: None") + check_result["failed"] = True + else: + down_neis = [] + for nei, v in neis.items(): + if v["state"] != "established": + down_neis.append(nei) + if down_neis: + logger.info("BGP neighbors down: {}".format(down_neis)) + check_result["failed"] = True + check_result["down_neighbors"] = down_neis + + logger.info("Done checking bgp status.") + return check_result + def check_dbmemory(dut): logger.info("Checking database memory...") @@ -168,6 +189,8 @@ def do_checks(dut, check_items): results.append(check_dbmemory(dut)) elif item == "processes": results.append(check_processes(dut)) + elif item == "bgp": + results.append(check_bgp_status(dut)) return results diff --git a/tests/common/plugins/sanity_check/constants.py b/tests/common/plugins/sanity_check/constants.py index bf622bdb600..73fe89081b1 100644 --- a/tests/common/plugins/sanity_check/constants.py +++ b/tests/common/plugins/sanity_check/constants.py @@ -13,7 +13,7 @@ # Recover related definitions RECOVER_METHODS = { - "config_reload": {"cmd": "bash -c 'config reload -y &>/dev/null'", "reboot": False, "adaptive": False, 'recover_wait': 60}, + "config_reload": {"cmd": "bash -c 'config reload -y &>/dev/null'", "reboot": False, "adaptive": False, 'recover_wait': 120}, "load_minigraph": {"cmd": "bash -c 'config load_minigraph -y &>/dev/null'", "reboot": False, "adaptive": False, 'recover_wait': 60}, "reboot": {"cmd": "reboot", "reboot": True, "adaptive": False, 'recover_wait': 120}, "warm_reboot": {"cmd": "warm-reboot", "reboot": True, "adaptive": False, 'recover_wait': 120}, @@ -21,4 +21,5 @@ "adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30}, } # All supported recover methods -SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"] # Supported checks +SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp"] # Supported checks +DEFAULT_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp"] # Default checks diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py index 55c6eca0c52..8876d39b403 100644 --- a/tests/common/plugins/sanity_check/recover.py +++ b/tests/common/plugins/sanity_check/recover.py @@ -32,7 +32,7 @@ def reboot_dut(dut, localhost, cmd, wait_time): def __recover_interfaces(dut, fanouthosts, result, wait_time): action = None for port in result['down_ports']: - logging.info("Restoring port {}".format(port)) + logging.warning("Restoring port: {}".format(port)) pn = str(port).lower() if 'portchannel' in pn or 'vlan' in pn: @@ -63,12 +63,11 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time): outstanding_action = None for result in check_results: if result['failed']: - logging.info("Restoring {}".format(result)) if result['check_item'] == 'interfaces': action = __recover_interfaces(dut, fanouthosts, result, wait_time) elif result['check_item'] == 'services': action = __recover_services(dut, result) - elif result['check_item'] == 'processes': + elif result['check_item'] in [ 'processes', 'bgp' ]: action = 'config_reload' else: action = 'reboot' @@ -78,6 +77,8 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time): if action and (not outstanding_action or outstanding_action == 'config_reload'): outstanding_action = action + logging.warning("Restoring {} with proposed action: {}, final action: {}".format(result, action, outstanding_action)) + if outstanding_action: method = constants.RECOVER_METHODS[outstanding_action] wait_time = method['recover_wait'] @@ -88,7 +89,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time): def recover(dut, localhost, fanouthosts, check_results, recover_method): - logger.info("Try to recover %s using method %s" % (dut.hostname, recover_method)) + logger.warning("Try to recover %s using method %s" % (dut.hostname, recover_method)) method = constants.RECOVER_METHODS[recover_method] wait_time = method['recover_wait'] if method["adaptive"]: diff --git a/tests/conftest.py b/tests/conftest.py index 06331ea13a6..2124305e96f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -121,6 +121,8 @@ def pytest_addoption(parser): help="Skip sanity check") parser.addoption("--allow_recover", action="store_true", default=False, help="Allow recovery attempt in sanity check in case of failure") + parser.addoption("--check_items", action="store", default=False, + help="Change (add|remove) check items in the check list") ######################## # pre-test options # From bc5af60f4c7707b6325714ade18d70c2eb5b56df Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Sun, 26 Jul 2020 17:48:36 +0000 Subject: [PATCH 2/2] warning when service is down --- tests/common/plugins/sanity_check/recover.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py index 8876d39b403..61b4c3e475d 100644 --- a/tests/common/plugins/sanity_check/recover.py +++ b/tests/common/plugins/sanity_check/recover.py @@ -50,7 +50,7 @@ def __recover_interfaces(dut, fanouthosts, result, wait_time): def __recover_services(dut, result): status = result['services_status'] services = [ x for x in status if not status[x] ] - logging.info("Service(s) down: {}".format(services)) + logging.warning("Service(s) down: {}".format(services)) return 'reboot' if 'database' in services else 'config_reload'