Skip to content

Commit a25aaeb

Browse files
authored
[202305] Fix LAG downtime calculation with SONiC neighbors (sonic-net#11243)
* For SONiC neighbors, in neigh_lag_status_check, use existing info in cli_info.

In the neigh_lag_status_check function, when the neighbor device is checked to see if there were any LAG flaps, instead of trying to get this information from the neighbor VM, just use the existing info we have in self.cli_info already. Otherwise, LAG flaps that may have happened prior to the test starting would show up here.

Signed-off-by: Saikrishna Arcot <sarcot@microsoft.com>
1 parent 3ddaa1d commit a25aaeb

2 files changed

Lines changed: 48 additions & 13 deletions

File tree

ansible/roles/test/files/ptftests/advanced-reboot.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,14 +1371,18 @@ def neigh_lag_status_check(self):
13711371
Ensure there are no interface flaps after warm-boot
13721372
"""
13731373
for neigh in self.ssh_targets:
1374-
self.test_params['port_channel_intf_idx'] = [x['ptf_ports'][0] for x in self.vm_dut_map.values()
1375-
if x['mgmt_addr'] == neigh]
1376-
self.neigh_handle = HostDevice.getHostDeviceInstance(self.test_params['neighbor_type'], neigh,
1377-
None, self.test_params)
1378-
self.neigh_handle.connect()
1379-
fails, flap_cnt = self.neigh_handle.verify_neigh_lag_no_flap()
1380-
self.neigh_handle.disconnect()
1381-
self.fails[neigh] |= fails
1374+
flap_cnt = None
1375+
if self.test_params['neighbor_type'] == "sonic":
1376+
flap_cnt = self.cli_info[neigh][1]
1377+
else:
1378+
self.test_params['port_channel_intf_idx'] = [x['ptf_ports'][0] for x in self.vm_dut_map.values()
1379+
if x['mgmt_addr'] == neigh]
1380+
self.neigh_handle = HostDevice.getHostDeviceInstance(self.test_params['neighbor_type'], neigh,
1381+
None, self.test_params)
1382+
self.neigh_handle.connect()
1383+
fails, flap_cnt = self.neigh_handle.verify_neigh_lag_no_flap()
1384+
self.neigh_handle.disconnect()
1385+
self.fails[neigh] |= fails
13821386
if not flap_cnt:
13831387
self.log("No LAG flaps seen on %s after warm boot" % neigh)
13841388
else:
@@ -1579,7 +1583,24 @@ def peer_state_check(self, ip, queue):
15791583
if x['mgmt_addr'] == ip]
15801584
ssh = HostDevice.getHostDeviceInstance(self.test_params['neighbor_type'], ip, queue,
15811585
self.test_params, log_cb=self.log)
1582-
self.fails[ip], self.info[ip], self.cli_info[ip], self.logs_info[ip], self.lacp_pdu_times[ip] = ssh.run()
1586+
try:
1587+
self.fails[ip], self.info[ip], self.cli_info[ip], self.logs_info[ip], self.lacp_pdu_times[ip] = ssh.run()
1588+
except Exception:
1589+
traceback_msg = traceback.format_exc()
1590+
self.log("Error in HostDevice: {}".format(traceback_msg))
1591+
self.fails[ip] = set()
1592+
self.fails[ip].add("HostDevice hit an exception")
1593+
self.info[ip] = set()
1594+
self.cli_info[ip] = {
1595+
"lacp": [0, 0],
1596+
"po": [0, 0],
1597+
"bgp_v4": [0, 0],
1598+
"bgp_v6": [0, 0],
1599+
}
1600+
self.logs_info[ip] = {}
1601+
self.lacp_pdu_times[ip] = {
1602+
"lacp_all": []
1603+
}
15831604
self.log('SSH thread for VM {} finished'.format(ip))
15841605

15851606
lacp_pdu_times = self.lacp_pdu_times[ip]

ansible/roles/test/files/ptftests/sonic.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,19 @@ def do_cmd(self, cmd):
6161
attempts += 1
6262
try:
6363
stdin, stdout, stderr = self.conn.exec_command(cmd, timeout=Sonic.SSH_CMD_TIMEOUT)
64-
return stdout.read()
64+
# Warning: This is a bit fragile. Both stdout and stderr use
65+
# the same SSH channel, and if the buffer for that channel is
66+
# full, then either stdout or stderr will block until there's
67+
# space in the channel.
68+
#
69+
# Therefore, fully read stdout first before trying to read
70+
# stderr. This assumes there's plenty of data on stdout to read,
71+
# but not much on stderr.
72+
stdoutOutput = stdout.read()
73+
stderrOutput = stderr.read()
74+
if len(stderrOutput) > 0:
75+
self.log("Output on stderr from command '{}': '{}'".format(cmd, stderrOutput))
76+
return stdoutOutput
6577
except socket.timeout:
6678
self.log("Timeout when running command: {}".format(cmd))
6779
return ""
@@ -175,12 +187,12 @@ def run(self):
175187
self.disconnect()
176188

177189
# save data for troubleshooting
178-
with open("/tmp/%s.data.pickle" % self.ip, "w") as fp:
190+
with open("/tmp/%s.data.pickle" % self.ip, "wb") as fp:
179191
pickle.dump(data, fp)
180192

181193
# save debug data for troubleshooting
182194
if self.DEBUG:
183-
with open("/tmp/%s.raw.pickle" % self.ip, "w") as fp:
195+
with open("/tmp/%s.raw.pickle" % self.ip, "wb") as fp:
184196
pickle.dump(debug_data, fp)
185197
with open("/tmp/%s.logging" % self.ip, "w") as fp:
186198
fp.write("\n".join(log_lines))
@@ -377,7 +389,7 @@ def parse_bgp_route(self, output, expects):
377389
return set(expects).issubset(prefixes)
378390

379391
def parse_supported_show_lacp_command(self):
380-
show_lacp_command = "show lacp neighbor"
392+
show_lacp_command = "show interface portchannel"
381393
self.log("show lacp command is '{}'".format(show_lacp_command))
382394
return show_lacp_command
383395

@@ -481,6 +493,8 @@ def verify_neigh_lag_no_flap(self):
481493
# Note: this function may have false-positives (with regards to link flaps). The start time used here is
482494
# the system's boot time, not the test start time, which means any LAG flaps before the start of the test
483495
# would get included here.
496+
#
497+
# You probably really don't want to call this function.
484498
log_lines = self.do_cmd("sudo cat /var/log/teamd.log{,.1}").split('\n')
485499
boot_time = datetime.datetime.strptime(self.do_cmd("uptime -s").strip(), "%Y-%m-%d %H:%M:%S")
486500
_, flap_cnt = self.check_lag_flaps("PortChannel1", log_lines, time.mktime(boot_time.timetuple()))

0 commit comments

Comments
 (0)