diff --git a/ansible/roles/test/files/helpers/arp_responder.py b/ansible/roles/test/files/helpers/arp_responder.py index d5686c115c8..89dce78f417 100644 --- a/ansible/roles/test/files/helpers/arp_responder.py +++ b/ansible/roles/test/files/helpers/arp_responder.py @@ -3,6 +3,8 @@ import struct import select import json +import argparse +import os.path from fcntl import ioctl from pprint import pprint @@ -34,7 +36,7 @@ def __init__(self, iface): def __del__(self): if self.socket: - self.socket.close() + self.socket.close() def bind(self): self.socket = socket.socket(socket.AF_PACKET, socket.SOCK_RAW, socket.htons(self.ETH_P_ALL)) @@ -93,7 +95,7 @@ def action(self, interface): if request_ip_str not in self.ip_sets[interface.name()]: return - arp_reply = self.generate_arp_reply(interface.mac(), remote_mac, request_ip, remote_ip) + arp_reply = self.generate_arp_reply(self.ip_sets[interface.name()][request_ip_str], remote_mac, request_ip, remote_ip) interface.send(arp_reply) return @@ -104,12 +106,36 @@ def extract_arp_info(self, data): def generate_arp_reply(self, local_mac, remote_mac, local_ip, remote_ip): return remote_mac + local_mac + self.arp_chunk + local_mac + local_ip + remote_mac + remote_ip + self.arp_pad +def parse_args(): + parser = argparse.ArgumentParser(description='ARP autoresponder') + parser.add_argument('--conf', '-c', type=str, dest='conf', default='/tmp/from_t1.json', help='path to json file with configuration') + parser.add_argument('--extended', '-e', action='store_true', dest='extended', default=False, help='enable extended mode') + args = parser.parse_args() + + return args def main(): - with open('/tmp/from_t1.json') as fp: + args = parse_args() + + if not os.path.exists(args.conf): + print "Can't find file %s" % args.conf + return + + with open(args.conf) as fp: data = json.load(fp) - ip_sets = {str(k): set(v) for k, v in data.items()} + # generate ip_sets. 
every ip address will have it's own uniq mac address + ip_sets = {} + counter = 0 + for iface, ip_dict in data.items(): + ip_sets[str(iface)] = {} + if args.extended: + for ip, mac in ip_dict.items(): + ip_sets[str(iface)][str(ip)] = binascii.unhexlify(str(mac)) + counter += 1 + else: + for ip in ip_dict: + ip_sets[str(iface)][str(ip)] = get_mac(iface) ifaces = [] for iface_name in ip_sets.keys(): diff --git a/ansible/roles/test/files/ptftests/fast-reboot.py b/ansible/roles/test/files/ptftests/fast-reboot.py index f3c3d0c741d..309a9485f3c 100644 --- a/ansible/roles/test/files/ptftests/fast-reboot.py +++ b/ansible/roles/test/files/ptftests/fast-reboot.py @@ -1,18 +1,34 @@ # -#ptf --test-dir ptftests fast-reboot.FastReloadTest --platform remote --platform-dir ptftests --qlen 1000 -t "verbose=True;dut_username='acsadmin';dut_hostname='10.3.147.243';fast_reboot_limit=30;portchannel_ports_file='/tmp/portchannel_interfaces.json';vlan_ports_file='/tmp/vlan_interfaces.json';ports_file='/tmp/ports.json';dut_mac='4c:76:25:f4:b7:00';vlan_ip_range='172.0.0.0/26';default_ip_range='192.168.0.0/16';vlan_ip_range='172.0.0.0/26'" +#ptf --test-dir ptftests fast-reboot --qlen=1000 --platform remote -t 'verbose=True;dut_username="admin";dut_hostname="10.0.0.243";fast_reboot_limit=30;portchannel_ports_file="/tmp/portchannel_interfaces.json";vlan_ports_file="/tmp/vlan_interfaces.json";ports_file="/tmp/ports.json";dut_mac="4c:76:25:f5:48:80";default_ip_range="192.168.0.0/16";vlan_ip_range="172.0.0.0/22";arista_vms="[\"10.0.0.200\",\"10.0.0.201\",\"10.0.0.202\",\"10.0.0.203\"]"' --platform-dir ptftests --disable-vxlan --disable-geneve --disable-erspan --disable-mpls --disable-nvgre # # -# This test measures length of DUT dataplane disruption in fast-reboot procedure. +# This test checks that DUT is able to make FastReboot procedure # # This test supposes that fast-reboot initiates by running /usr/bin/fast-reboot command. -# The test sequence are following: -# 1. Check that DUT is stable. 
That means that pings work in both directions: from T1 to servers and from servers to T1. +# +# The test uses "pings". The "pings" are packets which are sent through the dataplane in two directions +# 1. From one of the vlan interfaces to the T1 device. The source ip, source interface, and destination IP are chosen randomly from valid choices. The number of packets is 100. +# 2. From all of the portchannel ports to all of the vlan ports. The source ip, source interface, and destination IP are chosen sequentially from valid choices. +# Currently we have 500 distinct destination vlan addresses. Our target is to have 1000 of them. +# +# The test sequence is as follows: +# 1. Check that DUT is stable. That means that "pings" work in both directions: from T1 to servers and from servers to T1. # 2. If DUT is stable the test starts continiously pinging DUT in both directions. # 3. The test runs '/usr/bin/fast-reboot' on DUT remotely. The ssh key supposed to be uploaded by ansible before the test -# 3. As soon as it sees that ping starts failuring in one of directions the test registers a start of dataplace disruption -# 4. As soon as the test sess that pings start working for DUT in both directions it registers a stop of dataplane disruption -# 5. If the length of the disruption is less 30 seconds - the test passes - +# 4. As soon as it sees that pings start failing in one of the directions the test registers a start of dataplane disruption +# 5. As soon as the test sees that pings start working for DUT in both directions it registers a stop of dataplane disruption +# 6. If the length of the disruption is less than 30 seconds (if not redefined by parameter) - the test passes +# 7. If there are any drops while the control plane is down - the test fails +# 8. 
When the test starts the fast-reboot procedure it connects to all VMs (which emulate T1) and starts fetching the status of BGP and LACP +# LACP is supposed to go down only once, if not - the test fails +# if default value of BGP graceful restart timeout is less than 120 seconds the test fails +# if BGP graceful restart is not enabled on DUT the test fails +# If BGP graceful restart timeout value is almost exceeded (less than 15 seconds left) the test fails +# if BGP routes disappear more than once, the test fails +# +# The test expects you're running the test with link state propagation helper. +# That helper propagates a link state from fanout switch port to corresponding VM port +# import ptf from ptf.base_tests import BaseTest @@ -39,29 +55,332 @@ import re from collections import defaultdict import json +import paramiko +import Queue +import pickle +from operator import itemgetter + + +class Arista(object): + DEBUG = False + def __init__(self, ip, queue, test_params, login='admin', password='123456'): + self.ip = ip + self.queue = queue + self.login = login + self.password = password + self.conn = None + self.hostname = None + self.v4_routes = [test_params['vlan_ip_range'], test_params['lo_prefix']] + self.v6_routes = [test_params['lo_v6_prefix']] + self.fails = set() + self.min_bgp_gr_timeout = int(test_params['min_bgp_gr_timeout']) + + def __del__(self): + self.disconnect() + + def connect(self): + self.conn = paramiko.SSHClient() + self.conn.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + self.conn.connect(self.ip, username=self.login, password=self.password, allow_agent=False, look_for_keys=False) + self.shell = self.conn.invoke_shell() + + first_prompt = self.do_cmd(None, prompt = '>') + self.hostname = self.extract_hostname(first_prompt) + + self.do_cmd('enable') + self.do_cmd('terminal length 0') + + return self.shell + + def extract_hostname(self, first_prompt): + lines = first_prompt.split('\n') + prompt = lines[-1] + return prompt.strip().replace('>', 
'#') + + def do_cmd(self, cmd, prompt = None): + if prompt == None: + prompt = self.hostname + + if cmd is not None: + self.shell.send(cmd + '\n') + + input_buffer = '' + while prompt not in input_buffer: + input_buffer += self.shell.recv(16384) + + return input_buffer + + def disconnect(self): + if self.conn is not None: + self.conn.close() + self.conn = None + + return + + def run(self): + data = {} + debug_data = {} + run_once = False + log_first_line = None + quit_enabled = False + routing_works = True + self.connect() + while not (quit_enabled and v4_routing_ok and v6_routing_ok): + cmd = self.queue.get() + if cmd == 'quit': + quit_enabled = True + continue + cur_time = time.time() + info = {} + debug_info = {} + lacp_output = self.do_cmd('show lacp neighbor') + info['lacp'] = self.parse_lacp(lacp_output) + bgp_neig_output = self.do_cmd('show ip bgp neighbors') + info['bgp_neig'] = self.parse_bgp_neighbor(bgp_neig_output) + + bgp_route_v4_output = self.do_cmd('show ip route bgp | json') + v4_routing_ok = self.parse_bgp_route(bgp_route_v4_output, self.v4_routes) + info['bgp_route_v4'] = v4_routing_ok + + bgp_route_v6_output = self.do_cmd("show ipv6 route bgp | json") + v6_routing_ok = self.parse_bgp_route(bgp_route_v6_output, self.v6_routes) + info["bgp_route_v6"] = v6_routing_ok + + if not run_once: + self.ipv4_gr_enabled, self.ipv6_gr_enabled, self.gr_timeout = self.parse_bgp_neighbor_once(bgp_neig_output) + log_first_line = "session_begins_%f" % cur_time + self.do_cmd("send log message %s" % log_first_line) + run_once = True + + data[cur_time] = info + if self.DEBUG: + debug_data[cur_time] = { + 'show lacp neighbor' : lacp_output, + 'show ip bgp neighbors' : bgp_neig_output, + 'show ip route bgp' : bgp_route_v4_output, + 'show ipv6 route bgp' : bgp_route_v4_output, + } + + attempts = 15 + for _ in range(attempts): + log_output = self.do_cmd("show log | begin %s" % log_first_line) + log_lines = log_output.split("\r\n")[1:-1] + log_data = 
self.parse_logs(log_lines) + if len(log_data) != 0: + break + time.sleep(1) # wait until logs are populated + + self.disconnect() + + # save data for troubleshooting + with open("/tmp/%s.data.pickle" % self.ip, "w") as fp: + pickle.dump(data, fp) + + # save debug data for troubleshooting + if self.DEBUG: + with open("/tmp/%s.raw.pickle" % self.ip, "w") as fp: + pickle.dump(debug_data, fp) + with open("/tmp/%s.logging" % self.ip, "w") as fp: + fp.write("\n".join(log_lines)) + + self.check_gr_peer_status(data) + cli_data = {} + cli_data['lacp'] = self.check_series_status(data, "lacp", "LACP session") + cli_data['bgp_v4'] = self.check_series_status(data, "bgp_route_v4", "BGP v4 routes") + cli_data['bgp_v6'] = self.check_series_status(data, "bgp_route_v6", "BGP v6 routes") + + return self.fails, cli_data, log_data + + def extract_from_logs(self, regexp, data): + raw_data = [] + result = defaultdict(list) + initial_time = -1 + re_compiled = re.compile(regexp) + for line in data: + m = re_compiled.match(line) + if not m: + continue + raw_data.append((datetime.datetime.strptime(m.group(1), "%b %d %X"), m.group(2), m.group(3))) + + if len(raw_data) > 0: + initial_time = raw_data[0][0] + for when, what, status in raw_data: + offset = (when - initial_time if when > initial_time else initial_time - when).seconds + result[what].append((offset, status)) + + return result, initial_time + + def parse_logs(self, data): + result = {} + bgp_r = r'^(\S+ \d+ \S+) \S+ Rib: %BGP-5-ADJCHANGE: peer (\S+) .+ (\S+)$' + result_bgp, initial_time_bgp = self.extract_from_logs(bgp_r, data) + if_r = r'^(\S+ \d+ \S+) \S+ Ebra: %LINEPROTO-5-UPDOWN: Line protocol on Interface (\S+), changed state to (\S+)$' + result_if, initial_time_if = self.extract_from_logs(if_r, data) + + if initial_time_bgp == -1 or initial_time_if == -1: + return result + + for events in result_bgp.values(): + if events[-1][1] != 'Established': + return result + + # first state is Idle, last state is Established + for events 
in result_bgp.values(): + assert(events[0][1] == 'Idle') + assert(events[-1][1] == 'Established') + # first state is down, last state is up + for events in result_if.values(): + assert(events[0][1] == 'down') + assert(events[-1][1] == 'up') + + po_name = [ifname for ifname in result_if.keys() if 'Port-Channel' in ifname][0] + neigh_ipv4 = [neig_ip for neig_ip in result_bgp.keys() if '.' in neig_ip][0] + + result['PortChannel was down (seconds)'] = result_if[po_name][-1][0] - result_if[po_name][0][0] + for if_name in sorted(result_if.keys()): + result['Interface %s was down (times)' % if_name] = map(itemgetter(1), result_if[if_name]).count("down") + + for neig_ip in result_bgp.keys(): + key = "BGP IPv6 was down (seconds)" if ':' in neig_ip else "BGP IPv4 was down (seconds)" + result[key] = result_bgp[neig_ip][-1][0] - result_bgp[neig_ip][0][0] + + for neig_ip in result_bgp.keys(): + key = "BGP IPv6 was down (times)" if ':' in neig_ip else "BGP IPv4 was down (times)" + result[key] = map(itemgetter(1), result_bgp[neig_ip]).count("Idle") + + bgp_po_offset = (initial_time_if - initial_time_bgp if initial_time_if > initial_time_bgp else initial_time_bgp - initial_time_if).seconds + result['PortChannel went down after bgp session was down (seconds)'] = bgp_po_offset + result_if[po_name][0][0] + + for neig_ip in result_bgp.keys(): + key = "BGP IPv6 was gotten up after Po was up (seconds)" if ':' in neig_ip else "BGP IPv4 was gotten up after Po was up (seconds)" + result[key] = result_bgp[neig_ip][-1][0] - bgp_po_offset - result_if[po_name][-1][0] + + return result + + def parse_lacp(self, output): + return output.find('Bundled') != -1 + + def parse_bgp_neighbor_once(self, output): + is_gr_ipv4_enabled = False + is_gr_ipv6_enabled = False + restart_time = None + for line in output.split('\n'): + if ' Restart-time is' in line: + restart_time = int(line.replace(' Restart-time is ', '')) + continue + + if 'is enabled, Forwarding State is' in line: + if 'IPv6' in line: + 
is_gr_ipv6_enabled = True + elif 'IPv4' in line: + is_gr_ipv4_enabled = True + + return is_gr_ipv4_enabled, is_gr_ipv6_enabled, restart_time + + def parse_bgp_neighbor(self, output): + gr_active = None + gr_timer = None + for line in output.split('\n'): + if 'Restart timer is' in line: + gr_active = 'is active' in line + gr_timer = str(line[-9:-1]) + + return gr_active, gr_timer + + def parse_bgp_route(self, output, expects): + prefixes = set() + data = "\n".join(output.split("\r\n")[1:-1]) + obj = json.loads(data) + + if "vrfs" in obj and "default" in obj["vrfs"]: + obj = obj["vrfs"]["default"] + for prefix, attrs in obj["routes"].items(): + if "routeAction" not in attrs or attrs["routeAction"] != "forward": + continue + if all("Port-Channel" in via["interface"] for via in attrs["vias"]): + prefixes.add(prefix) + + return set(expects) == prefixes + + def check_gr_peer_status(self, output): + # [0] True 'ipv4_gr_enabled', [1] doesn't matter 'ipv6_enabled', [2] should be >= 120 + if not self.ipv4_gr_enabled: + self.fails.add("bgp ipv4 graceful restart is not enabled") + if not self.ipv6_gr_enabled: + pass # ToDo: + if self.gr_timeout < 120: # bgp graceful restart timeout less then 120 seconds + self.fails.add("bgp graceful restart timeout is less then 120 seconds") + + for when, other in sorted(output.items(), key = lambda x : x[0]): + gr_active, timer = other['bgp_neig'] + # wnen it's False, it's ok, wnen it's True, check that inactivity timer not less then self.min_bgp_gr_timeout seconds + if gr_active and datetime.datetime.strptime(timer, '%H:%M:%S') < datetime.datetime(1900, 1, 1, second = self.min_bgp_gr_timeout): + self.fails.add("graceful restart timer is almost finished. 
Less then %d seconds left" % self.min_bgp_gr_timeout) + + def check_series_status(self, output, entity, what): + # find how long anything was down + # Input parameter is a dictionary when:status + # constraints: + # entity must be down just once + # entity must be up when the test starts + # entity must be up when the test stops + + sorted_keys = sorted(output.keys()) + if not output[sorted_keys[0]][entity]: + self.fails.add("%s must be up when the test starts" % what) + return 0, 0 + if not output[sorted_keys[-1]][entity]: + self.fails.add("%s must be up when the test stops" % what) + return 0, 0 + + start = sorted_keys[0] + cur_state = True + res = defaultdict(list) + for when in sorted_keys[1:]: + if cur_state != output[when][entity]: + res[cur_state].append(when - start) + start = when + cur_state = output[when][entity] + res[cur_state].append(when - start) + + is_down_count = len(res[False]) + + if is_down_count > 1: + self.fails.add("%s must be down just for once" % what) + + return is_down_count, sum(res[False]) # summary_downtime class FastReloadTest(BaseTest): + TIMEOUT = 0.5 def __init__(self): BaseTest.__init__(self) + self.fails = {} + self.cli_info = {} + self.logs_info = {} self.log_fp = open('/tmp/fast-reboot.log', 'w') self.test_params = testutils.test_params_get() self.check_param('verbose', False, required = False) self.check_param('dut_username', '', required = True) self.check_param('dut_hostname', '', required = True) self.check_param('fast_reboot_limit', 30, required = False) + self.check_param('graceful_limit', 120, required = False) self.check_param('portchannel_ports_file', '', required = True) self.check_param('vlan_ports_file', '', required = True) self.check_param('ports_file', '', required = True) self.check_param('dut_mac', '', required = True) self.check_param('default_ip_range', '', required = True) self.check_param('vlan_ip_range', '', required = True) + self.check_param('lo_prefix', '10.1.0.32/32', required = False) + 
self.check_param('lo_v6_prefix', 'fc00:1::32/128', required = False) + self.check_param('arista_vms', [], required = True) + self.check_param('min_bgp_gr_timeout', 15, required = False) # Default settings self.nr_pc_pkts = 100 self.nr_tests = 3 self.reboot_delay = 10 self.task_timeout = 300 # Wait up to 5 minutes for tasks to complete - self.max_nr_vl_pkts = 1000 + self.max_nr_vl_pkts = 500 # FIXME: should be 1000. But bcm asic is not stable self.timeout_thr = None return @@ -151,11 +470,6 @@ def setUp(self): self.dut_ssh = self.test_params['dut_username'] + '@' + self.test_params['dut_hostname'] self.dut_mac = self.test_params['dut_mac'] # - self.from_server_src_addr = self.random_ip(vlan_ip_range) - self.from_server_src_port = self.random_port(self.vlan_ports) - self.from_server_dst_addr = self.random_ip(self.test_params['default_ip_range']) - self.from_server_dst_ports = self.portchannel_ports - self.nr_vl_pkts = self.generate_from_t1() self.log("Test params:") @@ -204,29 +518,35 @@ def get_mac(self, iff): def generate_from_t1(self): self.from_t1 = [] - self.ip_addr = [] vlan_ip_range = self.test_params['vlan_ip_range'] _, mask = vlan_ip_range.split('/') n_hosts = min(2**(32 - int(mask)) - 3, self.max_nr_vl_pkts) + dump = defaultdict(dict) + counter = 0 for i in xrange(2, n_hosts + 2): from_t1_src_addr = self.random_ip(self.test_params['default_ip_range']) from_t1_src_port = self.random_port(self.portchannel_ports) from_t1_dst_addr = self.host_ip(vlan_ip_range, i) - from_t1_dst_ports = self.vlan_ports[i % len(self.vlan_ports)] - from_t1_if_name = "eth%d" % from_t1_dst_ports + from_t1_dst_port = self.vlan_ports[i % len(self.vlan_ports)] + from_t1_if_name = "eth%d" % from_t1_dst_port from_t1_if_addr = "%s/%s" % (from_t1_dst_addr, vlan_ip_range.split('/')[1]) + vlan_mac_hex = '72060001%04x' % counter + lag_mac_hex = '5c010203%04x' % counter + mac_addr = ':'.join(lag_mac_hex[i:i+2] for i in range(0, len(lag_mac_hex), 2)) packet = simple_tcp_packet( + 
eth_src=mac_addr, eth_dst=self.dut_mac, ip_src=from_t1_src_addr, ip_dst=from_t1_dst_addr, ip_ttl=255, tcp_dport=5000 ) - self.from_t1.append((from_t1_src_port, from_t1_dst_ports, str(packet))) - self.ip_addr.append((from_t1_if_name, from_t1_if_addr)) + self.from_t1.append((from_t1_src_port, str(packet))) + dump[from_t1_if_name][from_t1_dst_addr] = vlan_mac_hex + counter += 1 exp_packet = simple_tcp_packet( ip_src="0.0.0.0", @@ -245,16 +565,32 @@ def generate_from_t1(self): # save data for arp_replay process with open("/tmp/from_t1.json", "w") as fp: - d = defaultdict(list) - for e in self.ip_addr: - d[e[0]].append(e[1].split('/')[0]) - json.dump(d, fp) + json.dump(dump, fp) + + random_vlan_iface = random.choice(dump.keys()) + self.from_server_src_port = int(random_vlan_iface.replace('eth','')) + self.from_server_src_addr = random.choice(dump[random_vlan_iface].keys()) + self.from_server_dst_addr = self.random_ip(self.test_params['default_ip_range']) + self.from_server_dst_ports = self.portchannel_ports return n_hosts def runTest(self): + self.reboot_start = None no_routing_start = None no_routing_stop = None + + arista_vms = self.test_params['arista_vms'][1:-1].split(",") + ssh_targets = [vm[1:-1] for vm in arista_vms] + + self.ssh_jobs = [] + for addr in ssh_targets: + q = Queue.Queue() + thr = threading.Thread(target=self.peer_state_check, kwargs={'ip': addr, 'queue': q}) + thr.setDaemon(True) + self.ssh_jobs.append((thr, q)) + thr.start() + thr = threading.Thread(target=self.background) thr.setDaemon(True) self.log("Check that device is alive and pinging") @@ -265,23 +601,108 @@ def runTest(self): self.log("Wait until ASIC stops") self.timeout(self.task_timeout, "DUT hasn't stopped in %d seconds" % self.task_timeout) - no_routing_start = self.check_stop() + no_routing_start, upper_replies = self.check_forwarding_stop() self.cancel_timeout() self.log("ASIC was stopped, Waiting until it's up. 
Stop time: %s" % str(no_routing_start)) self.timeout(self.task_timeout, "DUT hasn't started to work for %d seconds" % self.task_timeout) - no_routing_stop = self.check_start() + no_routing_stop, _ = self.check_forwarding_resume() self.cancel_timeout() - self.log("ASIC works again. Start time: %s" % str(no_routing_stop)) + # wait until all bgp session are established + self.log("Wait until bgp routing is up on all devices") + for _, q in self.ssh_jobs: + q.put('quit') + + self.timeout(self.task_timeout, "SSH threads haven't finished for %d seconds" % self.task_timeout) + while any(thr.is_alive() for thr, _ in self.ssh_jobs): + for _, q in self.ssh_jobs: + q.put('go') + time.sleep(self.TIMEOUT) + + for thr, _ in self.ssh_jobs: + thr.join() + self.cancel_timeout() + + self.log("ASIC works again. Start time: %s" % str(no_routing_stop)) + self.log("") + + no_cp_replies = self.extract_no_cpu_replies(upper_replies) + + self.fails['dut'] = set() + if no_routing_stop - no_routing_start > self.limit: + self.fails['dut'].add("Downtime must be less then %s seconds. 
It was %s" \ + % (self.test_params['fast_reboot_limit'], str(no_routing_stop - no_routing_start))) + if no_routing_stop - self.reboot_start > datetime.timedelta(seconds=self.test_params['graceful_limit']): + self.fails['dut'].add("Fast-reboot cycle must be less than graceful limit %s seconds" % self.test_params['graceful_limit']) + if no_cp_replies < 0.95 * self.nr_vl_pkts: + self.fails['dut'].add("Dataplane didn't route to all servers, when control-plane was down: %d vs %d" % (no_cp_replies, self.nr_vl_pkts)) + + # Generating report + self.log("="*50) + self.log("Report:") + self.log("="*50) + + self.log("LACP/BGP were down for (extracted from cli):") + self.log("-"*50) + for ip in sorted(self.cli_info.keys()): + self.log(" %s - lacp: %7.3f (%d) bgp v4: %7.3f (%d) bgp v6: %7.3f (%d)" \ + % (ip, self.cli_info[ip]['lacp'][1], self.cli_info[ip]['lacp'][0], \ + self.cli_info[ip]['bgp_v4'][1], self.cli_info[ip]['bgp_v4'][0],\ + self.cli_info[ip]['bgp_v6'][1], self.cli_info[ip]['bgp_v6'][0])) + + self.log("-"*50) + self.log("Extracted from VM logs:") + self.log("-"*50) + for ip in sorted(self.logs_info.keys()): + self.log("Extracted log info from %s" % ip) + for msg in sorted(self.logs_info[ip].keys()): + self.log(" %s : %d" % (msg, self.logs_info[ip][msg])) + self.log("-"*50) + + self.log("Summary:") + self.log("-"*50) self.log("Downtime was %s" % str(no_routing_stop - no_routing_start)) + self.log("Reboot time was %s" % str(no_routing_stop - self.reboot_start)) + + + self.log("How many packets were received back when control plane was down: %d Expected: %d" % (no_cp_replies, self.nr_vl_pkts)) + + is_good = all(len(fails) == 0 for fails in self.fails.values()) + + if not is_good: + self.log("-"*50) + self.log("Fails:") + self.log("-"*50) + + errors = "\n\nSomething went wrong. 
Please check output below:\n\n" + for name, fails in self.fails.items(): + for fail in fails: + self.log("FAILED:%s:%s" % (name, fail)) + errors += "FAILED:%s:%s\n" % (name, fail) - self.assertTrue(no_routing_stop - no_routing_start < self.limit, "Downtime must be less then %s seconds" % self.test_params['fast_reboot_limit']) + self.log("="*50) + + self.assertTrue(is_good, errors) + + def extract_no_cpu_replies(self, arr): + """ + This function tries to extract number of replies from dataplane, when control plane is non working + """ + # remove all tail zero values + non_zero = filter(lambda x : x > 0, arr) + + # check that last value is different from previos + if len(non_zero) > 1 and non_zero[-1] < non_zero[-2]: + return non_zero[-2] + else: + return non_zero[-1] def background(self): time.sleep(self.reboot_delay) self.log("Rebooting remote side") + self.reboot_start = datetime.datetime.now() stdout, stderr, return_code = self.cmd(["ssh", "-oStrictHostKeyChecking=no", self.dut_ssh, "sudo fast-reboot"]) if stdout != []: self.log("stdout from fast-reboot: %s" % str(stdout)) @@ -301,22 +722,30 @@ def cmd(self, cmds): return stdout, stderr, return_code - def check_stop(self): + def peer_state_check(self, ip, queue): + ssh = Arista(ip, queue, self.test_params) + self.fails[ip], self.cli_info[ip], self.logs_info[ip] = ssh.run() + + def check_forwarding_stop(self): return self.iteration(True) - def check_start(self): + def check_forwarding_resume(self): return self.iteration(False) def iteration(self, is_stop): recorded_time = None counter = self.nr_tests + nr_from_upper_array = [] while True: - success = self.ping_iteration() + success, nr_from_upper = self.ping_iteration() + nr_from_upper_array.append(nr_from_upper) + for _, q in self.ssh_jobs: + q.put('go') if success and is_stop or not success and not is_stop: - self.log("Success", True) + self.log("Base state", True) recorded_time = None else: - self.log("Not Success", True) + self.log("Changed state", True) if 
recorded_time is None: recorded_time = datetime.datetime.now() if counter == 0: @@ -324,36 +753,52 @@ def iteration(self, is_stop): else: counter -= 1 - return recorded_time + return recorded_time, nr_from_upper_array def ping_iteration(self): - return self.pingFromServers() > 0 and self.pingFromUpperTier() > 0 + replies_from_servers = self.pingFromServers() + if replies_from_servers > 0: + replies_from_upper = self.pingFromUpperTier() + else: + replies_from_upper = 0 + return replies_from_servers > 0 and replies_from_upper > 0, replies_from_upper def check_alive(self): # This function checks that DUT routes packets in both directions. # # Sometimes first attempt failes because ARP response to DUT is not so fast. - # But after this the functions expects to see "replies" on at least 50% of requests. + # But after this the functions expects to see steady "replies". # If the function sees that there is some issue with dataplane after we see successful replies # it consider that DUT is not healthy too - was_alive = 0 + # + # Sometimes I see that DUT returns more replies then requests. + # I think this is because of not populated FDB table + # The function waits while it's done + + was_alive = False for counter in range(self.nr_tests * 2): - success = self.ping_alive() + success, _ = self.ping_alive() if success: - was_alive += 1 + was_alive = True else: - if was_alive > 0: + if was_alive: return False # Stopped working after it working for sometime? 
- return was_alive > self.nr_tests + # wait, until FDB entries are populated + while self.ping_alive()[1]: + pass + + return True def ping_alive(self): nr_from_s = self.pingFromServers() nr_from_l = self.pingFromUpperTier() - is_success_from_s = nr_from_s > self.nr_pc_pkts * 0.7 - is_success_from_l = nr_from_l > self.nr_vl_pkts * 0.7 - return is_success_from_s and is_success_from_l + is_alive = nr_from_s > self.nr_pc_pkts * 0.7 and nr_from_l > self.nr_vl_pkts * 0.7 + is_asic_weird = nr_from_s > self.nr_pc_pkts or nr_from_l > self.nr_vl_pkts + # we receive more, then received. not populated FDB table + + return is_alive, is_asic_weird def pingFromServers(self): packet = simple_tcp_packet( @@ -378,7 +823,7 @@ def pingFromServers(self): for i in xrange(self.nr_pc_pkts): testutils.send_packet(self, self.from_server_src_port, raw_packet) - total_rcv_pkt_cnt = testutils.count_matched_packets_all_ports(self, exp_packet, self.from_server_dst_ports) + total_rcv_pkt_cnt = testutils.count_matched_packets_all_ports(self, exp_packet, self.from_server_dst_ports, timeout=self.TIMEOUT) self.log("Send %5d Received %5d servers->t1" % (self.nr_pc_pkts, total_rcv_pkt_cnt), True) @@ -386,9 +831,9 @@ def pingFromServers(self): def pingFromUpperTier(self): for entry in self.from_t1: - testutils.send_packet(self, entry[0], entry[2]) + testutils.send_packet(self, *entry) - total_rcv_pkt_cnt = testutils.count_matched_packets_all_ports(self, self.from_t1_exp_packet, self.vlan_ports) + total_rcv_pkt_cnt = testutils.count_matched_packets_all_ports(self, self.from_t1_exp_packet, self.vlan_ports, timeout=self.TIMEOUT) self.log("Send %5d Received %5d t1->servers" % (self.nr_vl_pkts, total_rcv_pkt_cnt), True) diff --git a/ansible/roles/test/tasks/fast-reboot.yml b/ansible/roles/test/tasks/fast-reboot.yml index 37b44964e88..2756dd56021 100644 --- a/ansible/roles/test/tasks/fast-reboot.yml +++ b/ansible/roles/test/tasks/fast-reboot.yml @@ -1,7 +1,13 @@ +# example: +# ansible-playbook sonic-test.yml 
-i str --limit device_1 --become --vault-password-file ~/password --tags fast_reboot -e "ptf_host=10.0.0.21" -e "vm_hosts=['10.0.0.200','10.0.0.201','10.0.0.202','10.0.0.203']" + - block: - fail: msg="Please set ptf_host variable" when: ptf_host is not defined + - fail: msg="Please set vm_hosts variable with a list of VMs" + when: vm_hosts is not defined + - name: Remove existing ip from ptf host script: roles/test/files/helpers/remove_ip.sh delegate_to: "{{ ptf_host }}" @@ -19,7 +25,9 @@ delegate_to: "{{ ptf_host }}" - name: Copy arp responder supervisor configuration to the PTF container - copy: src=roles/test/files/supervisor/arp_responder.conf dest=/etc/supervisor/conf.d + template: src=arp_responder.conf.j2 dest=/etc/supervisor/conf.d/arp_responder.conf + vars: + - arp_responder_args: '-e' delegate_to: "{{ ptf_host }}" - name: Reread supervisor configuration @@ -105,6 +113,7 @@ - dut_mac='{{ ansible_Ethernet0['macaddress'] }}' - default_ip_range='192.168.0.0/16' - vlan_ip_range=\"{{ minigraph_vlan_interfaces[0]['subnet'] }}\" + - arista_vms=\"{{ vm_hosts }}\" always: - name: Remove existing ip from ptf host diff --git a/ansible/roles/test/tasks/fdb.yml b/ansible/roles/test/tasks/fdb.yml index 7b8147535c4..b6fec757057 100644 --- a/ansible/roles/test/tasks/fdb.yml +++ b/ansible/roles/test/tasks/fdb.yml @@ -30,6 +30,12 @@ copy: src=roles/test/files/helpers/arp_responder.py dest=/opt delegate_to: "{{ptf_host}}" +- name: Copy arp responder supervisor configuration to the PTF container + template: src=arp_responder.conf.j2 dest=/etc/supervisor/conf.d/arp_responder.conf + vars: + - arp_responder_args: '' + delegate_to: "{{ ptf_host }}" + - name: Copy ARP responder supervisor configuration to PTF copy: src=roles/test/files/supervisor/arp_responder.conf dest=/etc/supervisor/conf.d delegate_to: "{{ptf_host}}" diff --git a/ansible/roles/test/files/supervisor/arp_responder.conf b/ansible/roles/test/templates/arp_responder.conf.j2 similarity index 75% rename from 
ansible/roles/test/files/supervisor/arp_responder.conf rename to ansible/roles/test/templates/arp_responder.conf.j2 index ce265452f56..7d6dcb3062d 100644 --- a/ansible/roles/test/files/supervisor/arp_responder.conf +++ b/ansible/roles/test/templates/arp_responder.conf.j2 @@ -1,5 +1,5 @@ [program:arp_responder] -command=/usr/bin/python /opt/arp_responder.py +command=/usr/bin/python /opt/arp_responder.py {{ arp_responder_args }} process_name=arp_responder stdout_logfile=/tmp/arp_responder.out.log stderr_logfile=/tmp/arp_responder.err.log