From 2b3d4623d36a1169e747c6894efa34537b1d76cb Mon Sep 17 00:00:00 2001 From: Arvindsrinivasan Lakshmi Narasimhan Date: Fri, 4 Dec 2020 02:10:18 +0000 Subject: [PATCH] sanity check changes for multi asic Signed-off-by: Arvindsrinivasan Lakshmi Narasimhan --- ansible/library/show_interface.py | 44 ++++++- ansible/library/show_ip_interface.py | 89 +++++++++++++ tests/common/devices.py | 108 +++++++++++---- tests/common/plugins/sanity_check/checks.py | 138 +++++++++++++------- tests/conftest.py | 2 +- 5 files changed, 302 insertions(+), 79 deletions(-) create mode 100644 ansible/library/show_ip_interface.py diff --git a/ansible/library/show_interface.py b/ansible/library/show_interface.py index 87ca39f5276..029f41e2db8 100644 --- a/ansible/library/show_interface.py +++ b/ansible/library/show_interface.py @@ -31,6 +31,14 @@ # Get show interface counter - show_interface: comamnd='counter' interface='Ethernet4' + # Get show interface status for all internal and external interfaces + - show_interface: command='status' include_internal_intfs=True + + # Get show interface status for the external interfaces in a namespace + - show_interface: command='status' namespace='asic0' + + # Get show interface status for the external and internal interfaces in a namespace + - show_interface: command='status' namespace='asic0' include_internal_intfs=True ''' RETURN = ''' @@ -65,9 +73,11 @@ class ShowInterfaceModule(object): def __init__(self): self.module = AnsibleModule( argument_spec=dict( - command=dict(required=True, type='str'), - interfaces=dict(required=False, type='list', default=None), - up_ports=dict(type='raw', default={}), + command=dict(required=True, type='str'), + namespace=dict(required=False, type='str', default=None), + interfaces=dict(required=False, type='list', default=None), + up_ports=dict(type='raw', default={}), + include_internal_intfs=dict(required=False, type=bool, default=False), ), supports_check_mode=False) self.m_args = self.module.params @@ -79,13 +89,18 @@ def run(self): """ 
Main method of the class """ - if self.m_args['command'] == 'status': self.collect_interface_status() - if self.m_args['command'] == 'counter': self.collect_interface_counter() + namespace = self.m_args["namespace"] + include_internal_intfs = self.m_args['include_internal_intfs'] + if self.m_args['command'] == 'status': + self.collect_interface_status(namespace, include_internal_intfs) + if self.m_args['command'] == 'counter': + self.collect_interface_counter() self.module.exit_json(ansible_facts=self.facts) - def collect_interface_status(self): + def collect_interface_status(self, namespace=None, include_internal_intfs=False): regex_int_fec = re.compile(r'(\S+)\s+[\d,N\/A]+\s+(\w+)\s+(\d+)\s+(rs|N\/A|none)\s+([\w\/]+)\s+(\w+)\s+(\w+)\s+(\w+)') regex_int = re.compile(r'(\S+)\s+[\d,N\/A]+\s+(\w+)\s+(\d+)\s+([\w\/]+)\s+(\w+)\s+(\w+)\s+(\w+)') + regex_int_internal = re.compile(r'(\S+)\s+[\d,N\/A]+\s+(\w+)\s+(\d+)\s+(rs|N\/A)\s+([\w\-]+)\s+(\w+)\s+(\w+)\s+(\w+)') self.int_status = {} if self.m_args['interfaces'] is not None: for interface in self.m_args['interfaces']: @@ -120,11 +135,16 @@ def collect_interface_status(self): self.module.fail_json(msg="Command failed rc=%d, out=%s, err=%s" % (rc, self.out, err)) else: try: - rc, self.out, err = self.module.run_command('show interface status', executable='/bin/bash', use_unsafe_shell=True) + cli_options = " -n {}".format(namespace) if namespace is not None else "" + if include_internal_intfs: + cli_options += " -d all" + intf_status_cmd = "show interface status{}".format(cli_options) + rc, self.out, err = self.module.run_command(intf_status_cmd, executable='/bin/bash', use_unsafe_shell=True) for line in self.out.split("\n"): line = line.strip() fec = regex_int_fec.match(line) old = regex_int.match(line) + internal = regex_int_internal.match(line) if fec: interface = fec.group(1) self.int_status[interface] = {} @@ -145,6 +165,16 @@ def collect_interface_status(self): self.int_status[interface]['vlan'] = old.group(5) 
self.int_status[interface]['oper_state'] = old.group(6) self.int_status[interface]['admin_state'] = old.group(7) + elif internal and include_internal_intfs: + interface = internal.group(1) + self.int_status[interface] = {} + self.int_status[interface]['name'] = interface + self.int_status[interface]['speed'] = internal.group(2) + self.int_status[interface]['fec'] = internal.group(4) + self.int_status[interface]['alias'] = internal.group(5) + self.int_status[interface]['vlan'] = internal.group(6) + self.int_status[interface]['oper_state'] = internal.group(7) + self.int_status[interface]['admin_state'] = internal.group(8) self.facts['int_status'] = self.int_status except Exception as e: self.module.fail_json(msg=str(e)) diff --git a/ansible/library/show_ip_interface.py b/ansible/library/show_ip_interface.py new file mode 100644 index 00000000000..6349d73ff95 --- /dev/null +++ b/ansible/library/show_ip_interface.py @@ -0,0 +1,89 @@ +#!/usr/bin/python + +from ansible.module_utils.basic import * +import re + +DOCUMENTATION = ''' +module: show_ip_interface.py +Short_description: Retrieve show ip interface +Description: + - Retrieve IPv4 address of interface and IPv4 address of its neighbor + +options: + - namespace:: + Description: In multi ASIC env, namespace to run the command + Required: False + +''' + +EXAMPLES = ''' + # Get show ip interface + - show_ip_interface: + + # Get show ip interface in namespace asic0 + - show_ip_interface: namespace='asic0' + +''' + + +class ShowIpInterfaceModule(object): + def __init__(self): + self.module = AnsibleModule( + argument_spec=dict( + namespace=dict(required=False, type='str', default=None), + ), + supports_check_mode=False + ) + self.m_args = self.module.params + self.out = None + self.facts = {} + self.ns = "" + ns = self.m_args["namespace"] + if ns is not None: + self.ns = "sudo ip netns exec {} ".format(ns) + + def run(self): + """ + Main method of the class + """ + regex_int = re.compile( + "\s*(\S+)\s+" # interface name 
+ "(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\/(\d{1,2})\s*" # IPv4 + "(up|down)\/(up|down)\s*" # admin/oper state + "(\S+)\s*" # neighbor name + "(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|N\/A)\s*" # peer IPv4 + ) + + self.ip_int = {} + try: + rc, self.out, err = self.module.run_command( + "{}show ip interfaces".format(self.ns), + executable='/bin/bash', + use_unsafe_shell=True + ) + for line in self.out.split("\n"): + line = line.strip() + m = re.match(regex_int, line) + if m: + self.ip_int[m.group(1)] = {} + self.ip_int[m.group(1)]["ipv4"] = m.group(2) + self.ip_int[m.group(1)]["prefix_len"] = m.group(3) + self.ip_int[m.group(1)]["admin"] = m.group(4) + self.ip_int[m.group(1)]["oper_state"] = m.group(5) + self.ip_int[m.group(1)]["bgp_neighbor"] = m.group(6) + self.ip_int[m.group(1)]["peer_ipv4"] = m.group(7) + self.facts['ip_interfaces'] = self.ip_int + except Exception as e: + self.module.fail_json(msg=str(e)) + if rc != 0: + self.module.fail_json(msg="Command failed rc = %d, out = %s, err = %s" % (rc, self.out, err)) + + self.module.exit_json(ansible_facts=self.facts) + +def main(): + ShowIpInt = ShowIpInterfaceModule() + ShowIpInt.run() + return + +if __name__ == "__main__": + main() diff --git a/tests/common/devices.py b/tests/common/devices.py index d930ab6d516..2c302cb11fa 100644 --- a/tests/common/devices.py +++ b/tests/common/devices.py @@ -127,7 +127,6 @@ class SonicHost(AnsibleHostBase): and also provides the ability to run Ansible modules on the SONiC device. 
""" - _DEFAULT_CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"] def __init__(self, ansible_adhoc, hostname, shell_user=None, shell_passwd=None): @@ -158,8 +157,8 @@ def __init__(self, ansible_adhoc, hostname, self._facts = self._gather_facts() self._os_version = self._get_os_version() + self.is_multi_asic = True if self.facts["num_asic"] > 1 else False - self.reset_critical_services_tracking_list() @property def facts(self): @@ -217,20 +216,16 @@ def critical_services(self, var): This list is used for tracking purposes ONLY. Updating the list does not actually modify any services running on the device. """ - - if self.facts["num_asic"] > 1: - self._critical_services = self._generate_critical_services_for_multi_asic(var) - else: - self._critical_services = var + self._critical_services = var logging.debug(self._critical_services) - def reset_critical_services_tracking_list(self): + def reset_critical_services_tracking_list(self, service_list): """ - Resets the list of critical services to the default. + Resets the list of critical services. """ - self.critical_services = self._DEFAULT_CRITICAL_SERVICES + self.critical_services = service_list def _gather_facts(self): """ @@ -270,21 +265,6 @@ def _get_asic_count(self, platform): def _get_router_mac(self): return self.command("sonic-cfggen -d -v 'DEVICE_METADATA.localhost.mac'")["stdout_lines"][0].decode("utf-8") - def _generate_critical_services_for_multi_asic(self, services): - """ - Generates a fully-qualified list of critical services for multi-asic platforms, based on a - base list of services. 
- - Example: - ["swss", "syncd"] -> ["swss0", "swss1", "swss2", "syncd0", "syncd1", "syncd2"] - """ - - m_service = [] - for service in services: - for asic in range(self.facts["num_asic"]): - asic_service = service + str(asic) - m_service.insert(asic, asic_service) - return m_service def _get_platform_info(self): """ @@ -1121,6 +1101,9 @@ def get_route(self, prefix): cmd = 'show bgp ipv4' if ipaddress.ip_network(unicode(prefix)).version == 4 else 'show bgp ipv6' return json.loads(self.shell('vtysh -c "{} {} json"'.format(cmd, prefix))['stdout']) + def run_redis_cli_cmd(self, redis_cmd): + cmd = "/usr/bin/redis-cli {}".format(redis_cmd) + return self.command(cmd) class K8sMasterHost(AnsibleHostBase): """ @@ -1545,6 +1528,8 @@ class SonicAsic(object): The purpose is to hide the complexity of handling ASIC/namespace specific details. For example, passing asic_id, namespace, instance_id etc. to ansible module to deal with namespaces. """ + + _DEFAULT_ASIC_SERVICES = ["bgp", "database", "lldp", "swss", "syncd", "teamd"] def __init__(self, sonichost, asic_index): """ Initializing a ASIC on a SONiC host. 
@@ -1554,7 +1539,26 @@ def __init__(self, sonichost, asic_index): """ self.sonichost = sonichost self.asic_index = asic_index + if self.sonichost.is_multi_asic: + self.namespace = "{}{}".format(NAMESPACE_PREFIX, self.asic_index) + else: + # set the namespace to DEFAULT_NAMESPACE(None) for single asic + self.namespace = DEFAULT_NAMESPACE + def get_critical_services(self): + """This function returns the list of the critical services + for the namespace(asic) + + If the dut is multi asic, then the asic_id is appended to the + _DEFAULT_ASIC_SERVICES list + Returns: + [list]: list of the services running in the namespace/asic + """ + a_service = [] + for service in self._DEFAULT_ASIC_SERVICES: + a_service.append("{}{}".format( + service, self.asic_index if self.sonichost.is_multi_asic else "")) + return a_service def bgp_facts(self, *module_args, **complex_args): """ Wrapper method for bgp_facts ansible module. @@ -1585,10 +1589,43 @@ def config_facts(self, *module_args, **complex_args): """ if 'host' not in complex_args: complex_args['host'] = self.sonichost.hostname - if self.sonichost.facts['num_asic'] != 1: - complex_args['namespace'] = 'asic{}'.format(self.asic_index) + if self.sonichost.is_multi_asic: + complex_args['namespace'] = self.namespace return self.sonichost.config_facts(*module_args, **complex_args) + def show_interface(self, *module_args, **complex_args): + """Wrapper for the ansible module 'show_interface' + + Args: + module_args: other ansible module args passed from the caller + complex_args: other ansible keyword args + + Returns: + [dict]: [the output of show interface status command] + """ + complex_args['namespace'] = self.namespace + return self.sonichost.show_interface(*module_args, **complex_args) + + def show_ip_interface(self, *module_args, **complex_args): + """Wrapper for the ansible module 'show_ip_interface' + + Args: + module_args: other ansible module args passed from the caller + complex_args: other ansible keyword args + + Returns: + 
[dict]: [the output of show ip interfaces command] + """ + complex_args['namespace'] = self.namespace + return self.sonichost.show_ip_interface(*module_args, **complex_args) + + def run_redis_cli_cmd(self, redis_cmd): + if self.namespace: + redis_cli = "/usr/bin/redis-cli" + cmd = "sudo ip netns exec {} {} {}".format(self.namespace, redis_cli,redis_cmd) + return self.sonichost.command(cmd) + # for single asic platforms there are no namespaces, so the redis-cli command is the same as on the DUT host + return self.sonichost.run_redis_cli_cmd(redis_cmd) class MultiAsicSonicHost(object): """ This class represents a Multi-asic SonicHost It has two attributes: @@ -1599,6 +1636,8 @@ class MultiAsicSonicHost(object): So, even a single asic pizza box is represented as a MultiAsicSonicHost with 1 SonicAsic. """ + _DEFAULT_SERVICES = ["pmon", "snmp", "lldp", "database"] + def __init__(self, ansible_adhoc, hostname): """ Initializing a MultiAsicSonicHost. @@ -1608,6 +1647,21 @@ def __init__(self, ansible_adhoc, hostname): """ self.sonichost = SonicHost(ansible_adhoc, hostname) self.asics = [SonicAsic(self.sonichost, asic_index) for asic_index in range(self.sonichost.facts["num_asic"])] + self.critical_services_tracking_list() + + def critical_services_tracking_list(self): + """Get the list of services running on the DUT + The services on the sonic devices are: + - services running on the host + - services which are replicated per asic + Returns: + [list]: list of the services running on the device + """ + service_list = [] + service_list+= self._DEFAULT_SERVICES + for asic in self.asics: + service_list += asic.get_critical_services() + self.sonichost.reset_critical_services_tracking_list(service_list) def _run_on_asics(self, *module_args, **complex_args): """ Run an asible module on asics based on 'asic_index' keyword in complex_args diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py index 154662d3b73..9845fd72718 100644 --- 
a/tests/common/plugins/sanity_check/checks.py +++ b/tests/common/plugins/sanity_check/checks.py @@ -43,16 +43,45 @@ def check_services(dut): return check_result -def _find_down_ports(dut, interfaces): - down_ports = [] - intf_facts = dut.interface_facts()['ansible_facts'] - for intf in interfaces: +def _find_down_phy_ports(dut, phy_interfaces): + down_phy_ports = [] + intf_facts = dut.show_interface(command='status', include_internal_intfs=True)['ansible_facts']['int_status'] + for intf in phy_interfaces: + try: + if intf_facts[intf]['oper_state'] == 'down': + down_phy_ports.append(intf) + except KeyError: + down_phy_ports.append(intf) + return down_phy_ports + + +def _find_down_ip_ports(dut, ip_interfaces): + down_ip_ports = [] + ip_intf_facts = dut.show_ip_interface()['ansible_facts']['ip_interfaces'] + for intf in ip_interfaces: try: - port = intf_facts["ansible_interface_facts"][intf] - if not port["link"] or not port["active"]: - down_ports.append(intf) + if ip_intf_facts[intf]['oper_state'] == 'down': + down_ip_ports.append(intf) except KeyError: - down_ports.append(intf) + down_ip_ports.append(intf) + return down_ip_ports + + +def _find_down_ports(dut, phy_interfaces, ip_interfaces): + """Finds the ports which are operationally down + + Args: + dut (object): The sonichost/sonicasic object + phy_interfaces (list): List of all physical interfaces in 'admin_up' state + ip_interfaces (list): List of the L3 interfaces + + Returns: + [list]: list of the down ports + """ + down_ports = [] + down_ports = _find_down_ip_ports(dut, ip_interfaces) + \ + _find_down_phy_ports(dut, phy_interfaces) + return down_ports @@ -64,37 +93,44 @@ def check_interfaces(dut): interval = 20 logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \ (networking_uptime, timeout, interval)) - - cfg_facts = dut.config_facts(host=dut.hostname, source="persistent")['ansible_facts'] - interfaces = [k for k,v in cfg_facts["PORT"].items() if "admin_status" in v and 
v["admin_status"] == "up"] - if "PORTCHANNEL_INTERFACE" in cfg_facts: - interfaces += cfg_facts["PORTCHANNEL_INTERFACE"].keys() - if "VLAN_INTERFACE" in cfg_facts: - interfaces += cfg_facts["VLAN_INTERFACE"].keys() - - logger.info(json.dumps(interfaces, indent=4)) - + + down_ports = [] check_result = {"failed": True, "check_item": "interfaces"} - if timeout == 0: # Check interfaces status, do not retry. - down_ports = _find_down_ports(dut, interfaces) - check_result["failed"] = True if len(down_ports) > 0 else False - check_result["down_ports"] = down_ports - else: # Retry checking interface status - start = time.time() - elapsed = 0 - while elapsed < timeout: - down_ports = _find_down_ports(dut, interfaces) + for asic in dut.asics: + ip_interfaces = [] + cfg_facts = asic.config_facts(host=dut.hostname, + source="persistent")['ansible_facts'] + phy_interfaces = [k for k, v in cfg_facts["PORT"].items() if "admin_status" in v and v["admin_status"] == "up"] + if "PORTCHANNEL_INTERFACE" in cfg_facts: + ip_interfaces = cfg_facts["PORTCHANNEL_INTERFACE"].keys() + if "VLAN_INTERFACE" in cfg_facts: + ip_interfaces += cfg_facts["VLAN_INTERFACE"].keys() + + logger.info(json.dumps(phy_interfaces, indent=4)) + logger.info(json.dumps(ip_interfaces, indent=4)) + + if timeout == 0: # Check interfaces status, do not retry. + down_ports += _find_down_ports(asic, phy_interfaces, ip_interfaces) check_result["failed"] = True if len(down_ports) > 0 else False check_result["down_ports"] = down_ports - - if check_result["failed"]: - wait(interval, msg="Found down ports, wait %d seconds to retry. 
Remaining time: %d, down_ports=%s" % \ - (interval, int(timeout - elapsed), str(check_result["down_ports"]))) - elapsed = time.time() - start - else: - break + else: # Retry checking interface status + start = time.time() + elapsed = 0 + while elapsed < timeout: + down_ports = _find_down_ports(asic, phy_interfaces, ip_interfaces) + check_result["failed"] = True if len(down_ports) > 0 else False + check_result["down_ports"] = down_ports + + if check_result["failed"]: + wait(interval, msg="Found down ports, wait %d seconds to retry. Remaining time: %d, down_ports=%s" % \ + (interval, int(timeout - elapsed), str(check_result["down_ports"]))) + elapsed = time.time() - start + else: + break logger.info("Done checking interfaces status.") + check_result["failed"] = True if len(down_ports) > 0 else False + check_result["down_ports"] = down_ports return check_result @@ -151,24 +187,38 @@ def _check_bgp_status_helper(): logger.info("Done checking bgp status on %s" % dut.hostname) return check_result -def check_dbmemory(dut): - logger.info("Checking database memory on %s..." % dut.hostname) + +def _is_db_omem_over_threshold(command_output): total_omem = 0 re_omem = re.compile("omem=(\d+)") - res = dut.command("/usr/bin/redis-cli client list") - for l in res['stdout_lines']: - m = re_omem.search(l) + result = False + + for line in command_output: + m = re_omem.search(line) if m: omem = int(m.group(1)) total_omem += omem - - logger.info(json.dumps(res['stdout_lines'], indent=4)) - check_result = {"failed": False, "check_item": "dbmemory"} + logger.info(json.dumps(command_output, indent=4)) if total_omem > OMEM_THRESHOLD_BYTES: - check_result["failed"] = True - check_result["total_omem"] = total_omem + result = True + + return result, total_omem + +def check_dbmemory(dut): + logger.info("Checking database memory on %s..." 
% dut.hostname) + redis_cmd = "client list" + check_result = {"failed": False, "check_item": "dbmemory"} + # check the db memory on the redis instance running on each asic + for asic in dut.asics: + res = asic.run_redis_cli_cmd(redis_cmd)['stdout_lines'] + result, total_omem = _is_db_omem_over_threshold(res) + if result: + check_result["failed"] = True + check_result["total_omem"] = total_omem + logging.info("{} db memory over the threshold ".format(str(asic.namespace or ''))) + break logger.info("Done checking database memory") return check_result diff --git a/tests/conftest.py b/tests/conftest.py index e6f794e2f09..4af1becda57 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -197,7 +197,7 @@ def reset_critical_services_list(duthosts): Resets the critical services list between test modules to ensure that it is left in a known state after tests finish running. """ - [a_dut.reset_critical_services_tracking_list() for a_dut in duthosts] + [a_dut.critical_services_tracking_list() for a_dut in duthosts] @pytest.fixture(scope="session") def localhost(ansible_adhoc):