From 7613d3cf8ed7b8cf2b2129fed244b0535fe4af71 Mon Sep 17 00:00:00 2001 From: Saikrishna Arcot Date: Tue, 9 Jan 2024 16:50:11 -0800 Subject: [PATCH 1/2] Add teamd retry count script to 202205 image This backports a part of #3069 to allow upgrading from 202205 to 202305 with the teamd retry count feature. Signed-off-by: Saikrishna Arcot --- scripts/fast-reboot | 30 +++ scripts/teamd_increase_retry_count.py | 330 ++++++++++++++++++++++++++ 2 files changed, 360 insertions(+) create mode 100755 scripts/teamd_increase_retry_count.py diff --git a/scripts/fast-reboot b/scripts/fast-reboot index b89592e27f..30b0c787d6 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -30,6 +30,7 @@ TAG_LATEST=yes DETACH=no LOG_PATH="/var/log/${REBOOT_TYPE}.txt" UIMAGE_HDR_SIZE=64 +REQUIRE_TEAMD_RETRY_COUNT=no # Require 100M available on the hard drive for warm reboot temp files, # Size is in 1K blocks: @@ -47,6 +48,7 @@ EXIT_DB_INTEGRITY_FAILURE=15 EXIT_NO_CONTROL_PLANE_ASSISTANT=20 EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21 EXIT_PLATFORM_FW_AU_FAILURE=22 +EXIT_TEAMD_RETRY_COUNT_FAILURE=23 function error() { @@ -77,6 +79,8 @@ function showHelpAndExit() echo " - control plane assistant IP list." 
echo " -t : Don't tag the current kube images as latest" echo " -D : detached mode - closing terminal will not cause stopping reboot" + echo " -n : don't require peer devices to be running SONiC with retry count feature [default]" + echo " -N : require peer devices to be running SONiC with retry count feature" exit "${EXIT_SUCCESS}" } @@ -121,6 +125,12 @@ function parseOptions() D ) DETACH=yes ;; + n ) + REQUIRE_TEAMD_RETRY_COUNT=no + ;; + N ) + REQUIRE_TEAMD_RETRY_COUNT=yes + ;; esac done } @@ -605,6 +615,22 @@ init_warm_reboot_states setup_control_plane_assistant +TEAMD_INCREASE_RETRY_COUNT=0 +if [[ "${REBOOT_TYPE}" = "warm-reboot" || "${REBOOT_TYPE}" = "fastfast-reboot" ]]; then + TEAMD_RETRY_COUNT_PROBE_RC=0 + /usr/local/bin/teamd_increase_retry_count.py --probe-only || TEAMD_RETRY_COUNT_PROBE_RC=$? + if [[ ${TEAMD_RETRY_COUNT_PROBE_RC} -ne 0 ]]; then + if [[ "${REQUIRE_TEAMD_RETRY_COUNT}" = "yes" ]]; then + error "Could not confirm that all neighbor devices are running SONiC with the retry count feature" + exit "${EXIT_TEAMD_RETRY_COUNT_FAILURE}" + else + debug "Warning: Retry count feature support unknown for one or more neighbor devices; assuming that it's not available" + fi + else + TEAMD_INCREASE_RETRY_COUNT=1 + fi +fi + if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then # Freeze orchagent for warm restart # Ask orchagent_restart_check to try freeze 5 times with interval of 2 seconds, @@ -633,6 +659,10 @@ if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then fi fi +if [[ ( "${REBOOT_TYPE}" = "warm-reboot" || "${REBOOT_TYPE}" = "fastfast-reboot" ) && "${TEAMD_INCREASE_RETRY_COUNT}" -eq 1 ]]; then + /usr/local/bin/teamd_increase_retry_count.py +fi + # We are fully committed to reboot from this point on because critical # service will go down and we cannot recover from it. 
set +e diff --git a/scripts/teamd_increase_retry_count.py b/scripts/teamd_increase_retry_count.py new file mode 100755 index 0000000000..d5151b69b9 --- /dev/null +++ b/scripts/teamd_increase_retry_count.py @@ -0,0 +1,330 @@ +#!/usr/bin/python3 + +import subprocess +import json +from scapy.config import conf +conf.ipv6_enabled = False +conf.verb = False +from scapy.fields import ByteField, ShortField, MACField, XStrFixedLenField, ConditionalField +from scapy.layers.l2 import Ether +from scapy.sendrecv import sendp, sniff +from scapy.packet import Packet, split_layers, bind_layers +import scapy.contrib.lacp +import os +import re +import sys +from threading import Thread, Event +import time +import argparse +import signal + +from sonic_py_common import logger +from swsscommon.swsscommon import DBConnector, Table + +log = logger.Logger() +revertTeamdRetryCountChanges = False +DEFAULT_RETRY_COUNT = 3 +EXTENDED_RETRY_COUNT = 5 +SLOW_PROTOCOL_MAC_ADDRESS = "01:80:c2:00:00:02" +LACP_ETHERTYPE = 0x8809 + +class LACPRetryCount(Packet): + name = "LACPRetryCount" + fields_desc = [ + ByteField("version", 0xf1), + ByteField("actor_type", 1), + ByteField("actor_length", 20), + ShortField("actor_system_priority", 0), + MACField("actor_system", None), + ShortField("actor_key", 0), + ShortField("actor_port_priority", 0), + ShortField("actor_port_number", 0), + ByteField("actor_state", 0), + XStrFixedLenField("actor_reserved", "", 3), + ByteField("partner_type", 2), + ByteField("partner_length", 20), + ShortField("partner_system_priority", 0), + MACField("partner_system", None), + ShortField("partner_key", 0), + ShortField("partner_port_priority", 0), + ShortField("partner_port_number", 0), + ByteField("partner_state", 0), + XStrFixedLenField("partner_reserved", "", 3), + ByteField("collector_type", 3), + ByteField("collector_length", 16), + ShortField("collector_max_delay", 0), + XStrFixedLenField("collector_reserved", "", 12), + ConditionalField(ByteField("actor_retry_count_type", 
0x80), lambda pkt:pkt.version == 0xf1), + ConditionalField(ByteField("actor_retry_count_length", 4), lambda pkt:pkt.version == 0xf1), + ConditionalField(ByteField("actor_retry_count", 0), lambda pkt:pkt.version == 0xf1), + ConditionalField(XStrFixedLenField("actor_retry_count_reserved", "", 1), lambda pkt:pkt.version == 0xf1), + ConditionalField(ByteField("partner_retry_count_type", 0x81), lambda pkt:pkt.version == 0xf1), + ConditionalField(ByteField("partner_retry_count_length", 4), lambda pkt:pkt.version == 0xf1), + ConditionalField(ByteField("partner_retry_count", 0), lambda pkt:pkt.version == 0xf1), + ConditionalField(XStrFixedLenField("partner_retry_count_reserved", "", 1), lambda pkt:pkt.version == 0xf1), + ByteField("terminator_type", 0), + ByteField("terminator_length", 0), + ConditionalField(XStrFixedLenField("reserved", "", 42), lambda pkt:pkt.version == 0xf1), + ConditionalField(XStrFixedLenField("reserved", "", 50), lambda pkt:pkt.version != 0xf1), + ] + +split_layers(scapy.contrib.lacp.SlowProtocol, scapy.contrib.lacp.LACP, subtype=1) +bind_layers(scapy.contrib.lacp.SlowProtocol, LACPRetryCount, subtype=1) + +class LacpPacketListenThread(Thread): + def __init__(self, port, targetMacAddress, sendReadyEvent): + Thread.__init__(self) + self.port = port + self.targetMacAddress = targetMacAddress + self.sendReadyEvent = sendReadyEvent + self.detectedNewVersion = False + + def lacpPacketCallback(self, pkt): + if pkt["LACPRetryCount"].version == 0xf1: + self.detectedNewVersion = True + return self.detectedNewVersion + + def run(self): + sniff(stop_filter=self.lacpPacketCallback, iface=self.port, filter="ether proto {} and ether src {}".format(LACP_ETHERTYPE, self.targetMacAddress), + store=0, timeout=30, started_callback=self.sendReadyEvent.set) + +def getPortChannels(): + applDb = DBConnector("APPL_DB", 0) + configDb = DBConnector("CONFIG_DB", 0) + portChannelTable = Table(applDb, "LAG_TABLE") + portChannels = portChannelTable.getKeys() + activePortChannels 
= [] + for portChannel in portChannels: + state = portChannelTable.get(portChannel) + if not state or not state[0]: + continue + isAdminUp = False + isOperUp = False + for key, value in state[1]: + if key == "admin_status": + isAdminUp = value == "up" + elif key == "oper_status": + isOperUp = value == "up" + if isAdminUp and isOperUp: + activePortChannels.append(portChannel) + + # Now find out which BGP sessions on these port channels are admin up. This needs to go + # through a circuitous sequence of steps. + # + # 1. Get the local IPv4/IPv6 address assigned to each port channel. + # 2. Find out which BGP session (in CONFIG_DB) has a local_addr attribute of the local + # IPv4/IPv6 address. + # 3. Check the admin_status field of that table in CONFIG_DB. + portChannelData = {} + portChannelInterfaceTable = Table(configDb, "PORTCHANNEL_INTERFACE") + portChannelInterfaces = portChannelInterfaceTable.getKeys() + for portChannelInterface in portChannelInterfaces: + if "|" not in portChannelInterface: + continue + portChannel = portChannelInterface.split("|")[0] + ipAddress = portChannelInterface.split("|")[1].split("/")[0].lower() + if portChannel not in activePortChannels: + continue + portChannelData[ipAddress] = { + "portChannel": portChannel, + "adminUp": False + } + + deviceMetadataTable = Table(configDb, "DEVICE_METADATA") + metadata = deviceMetadataTable.get("localhost") + defaultBgpStatus = True + for key, value in metadata[1]: + if key == "default_bgp_status": + defaultBgpStatus = value == "up" + break + + bgpTable = Table(configDb, "BGP_NEIGHBOR") + bgpNeighbors = bgpTable.getKeys() + for bgpNeighbor in bgpNeighbors: + neighborData = bgpTable.get(bgpNeighbor) + if not neighborData[0]: + continue + localAddr = None + isAdminUp = defaultBgpStatus + for key, value in neighborData[1]: + if key == "local_addr": + if value not in portChannelData: + break + localAddr = value.lower() + elif key == "admin_status": + isAdminUp = value == "up" + if not localAddr: + 
continue + portChannelData[localAddr]["adminUp"] = isAdminUp + + return set([portChannelData[x]["portChannel"] for x in portChannelData.keys() if portChannelData[x]["adminUp"]]) + +def getPortChannelConfig(portChannelName): + (processStdout, _) = getCmdOutput(["teamdctl", portChannelName, "state", "dump"]) + return json.loads(processStdout) + +def getLldpNeighbors(): + (processStdout, _) = getCmdOutput(["lldpctl", "-f", "json"]) + return json.loads(processStdout) + +def craftLacpPacket(portChannelConfig, portName, isResetPacket=False, newVersion=True): + portConfig = portChannelConfig["ports"][portName] + actorConfig = portConfig["runner"]["actor_lacpdu_info"] + partnerConfig = portConfig["runner"]["partner_lacpdu_info"] + l2 = Ether(dst=SLOW_PROTOCOL_MAC_ADDRESS, src=portConfig["ifinfo"]["dev_addr"], type=LACP_ETHERTYPE) + l3 = scapy.contrib.lacp.SlowProtocol(subtype=0x01) + l4 = LACPRetryCount() + if newVersion: + l4.version = 0xf1 + else: + l4.version = 0x1 + l4.actor_system_priority = actorConfig["system_priority"] + l4.actor_system = actorConfig["system"] + l4.actor_key = actorConfig["key"] + l4.actor_port_priority = actorConfig["port_priority"] + l4.actor_port_number = actorConfig["port"] + l4.actor_state = actorConfig["state"] + l4.partner_system_priority = partnerConfig["system_priority"] + l4.partner_system = partnerConfig["system"] + l4.partner_key = partnerConfig["key"] + l4.partner_port_priority = partnerConfig["port_priority"] + l4.partner_port_number = partnerConfig["port"] + l4.partner_state = partnerConfig["state"] + if newVersion: + l4.actor_retry_count = EXTENDED_RETRY_COUNT if not isResetPacket else DEFAULT_RETRY_COUNT + l4.partner_retry_count = DEFAULT_RETRY_COUNT + packet = l2 / l3 / l4 + return packet + +def sendLacpPackets(packets, revertPackets): + global revertTeamdRetryCountChanges + while not revertTeamdRetryCountChanges: + for port, packet in packets: + sendp(packet, iface=port) + time.sleep(15) + if revertTeamdRetryCountChanges: + for 
port, packet in revertPackets: + sendp(packet, iface=port) + +def abortTeamdChanges(signum, frame): + global revertTeamdRetryCountChanges + log.log_info("Got signal {}, reverting teamd retry count change".format(signum)) + revertTeamdRetryCountChanges = True + +def getCmdOutput(cmd): + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) + return proc.communicate()[0], proc.returncode + +def main(probeOnly=False): + if os.geteuid() != 0: + log.log_error("Root privileges required for this operation", also_print_to_console=True) + sys.exit(1) + portChannels = getPortChannels() + if not portChannels: + log.log_info("No port channels retrieved; exiting") + return + failedPortChannels = [] + if probeOnly: + for portChannel in portChannels: + config = getPortChannelConfig(portChannel) + lldpInfo = getLldpNeighbors() + portChannelChecked = False + for portName in config["ports"].keys(): + if not "runner" in config["ports"][portName] or \ + not "partner_lacpdu_info" in config["ports"][portName]["runner"] or \ + not "actor_lacpdu_info" in config["ports"][portName]["runner"]: + log.log_error("ERROR: Missing information from teamd about {}; skipping".format(portName)) + failedPortChannels.append(portChannel) + break + + interfaceLldpInfo = [k for k in lldpInfo["lldp"]["interface"] if portName in k] + if not interfaceLldpInfo: + log.log_warning("WARNING: No LLDP info available for {}; skipping".format(portName)) + continue + interfaceLldpInfo = interfaceLldpInfo[0][portName] + peerName = list(interfaceLldpInfo["chassis"].keys())[0] + peerInfo = interfaceLldpInfo["chassis"][peerName] + if "descr" not in peerInfo: + log.log_warning("WARNING: No peer description available via LLDP for {}; skipping".format(portName)) + continue + portChannelChecked = True + if "sonic" not in peerInfo["descr"].lower(): + log.log_warning("WARNING: Peer device is not a SONiC device; skipping") + failedPortChannels.append(portChannel) + break + + sendReadyEvent = Event() + + # Start sniffing thread + 
lacpThread = LacpPacketListenThread(portName, config["ports"][portName]["runner"]["partner_lacpdu_info"]["system"], sendReadyEvent) + lacpThread.start() + + # Generate and send probe packet after sniffing has started + probePacket = craftLacpPacket(config, portName) + sendReadyEvent.wait() + sendp(probePacket, iface=portName) + + lacpThread.join() + + resetProbePacket = craftLacpPacket(config, portName, newVersion=False) + # 2-second sleep for making sure all processing is done on the peer device + time.sleep(2) + sendp(resetProbePacket, iface=portName, count=2, inter=0.5) + + if lacpThread.detectedNewVersion: + log.log_notice("SUCCESS: Peer device {} is running version of SONiC with teamd retry count feature".format(peerName), also_print_to_console=True) + break + else: + log.log_warning("WARNING: Peer device {} is running version of SONiC without teamd retry count feature".format(peerName), also_print_to_console=True) + failedPortChannels.append(portChannel) + break + if not portChannelChecked: + log.log_warning("WARNING: No information available about peer device on port channel {}".format(portChannel), also_print_to_console=True) + failedPortChannels.append(portChannel) + if failedPortChannels: + log.log_error("ERROR: There are port channels/peer devices that failed the probe: {}".format(failedPortChannels), also_print_to_console=True) + sys.exit(2) + else: + global revertTeamdRetryCountChanges + signal.signal(signal.SIGUSR1, abortTeamdChanges) + signal.signal(signal.SIGTERM, abortTeamdChanges) + (_, rc) = getCmdOutput(["config", "portchannel", "retry-count", "get", list(portChannels)[0]]) + if rc == 0: + # Currently running on SONiC version with teamd retry count feature + for portChannel in portChannels: + getCmdOutput(["config", "portchannel", "retry-count", "set", portChannel, str(EXTENDED_RETRY_COUNT)]) + pid = os.fork() + if pid == 0: + # Running in a new process, detached from parent process + while not revertTeamdRetryCountChanges: + time.sleep(15) + if 
revertTeamdRetryCountChanges: + for portChannel in portChannels: + getCmdOutput(["config", "portchannel", "retry-count", "set", portChannel, str(DEFAULT_RETRY_COUNT)]) + else: + lacpPackets = [] + revertLacpPackets = [] + for portChannel in portChannels: + config = getPortChannelConfig(portChannel) + for portName in config["ports"].keys(): + if not "runner" in config["ports"][portName] or \ + not "partner_lacpdu_info" in config["ports"][portName]["runner"] or \ + not "actor_lacpdu_info" in config["ports"][portName]["runner"]: + log.log_error("ERROR: Missing information from teamd about {}; skipping".format(portName)) + break + + packet = craftLacpPacket(config, portName) + lacpPackets.append((portName, packet)) + packet = craftLacpPacket(config, portName, isResetPacket=True) + revertLacpPackets.append((portName, packet)) + pid = os.fork() + if pid == 0: + # Running in a new process, detached from parent process + sendLacpPackets(lacpPackets, revertLacpPackets) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Teamd retry count changer.') + parser.add_argument('--probe-only', action='store_true', + help='Probe the peer devices only, to verify that they support the teamd retry count feature') + args = parser.parse_args() + main(args.probe_only) From 43a7c4ae287cf61988df458f5650144d43ce121e Mon Sep 17 00:00:00 2001 From: Saikrishna Arcot Date: Tue, 16 Jan 2024 09:01:32 -0800 Subject: [PATCH 2/2] Actually install teamd_increase_retry_count.py Signed-off-by: Saikrishna Arcot --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 028c42e82c..8dac8fa135 100644 --- a/setup.py +++ b/setup.py @@ -139,6 +139,7 @@ 'scripts/soft-reboot', 'scripts/storyteller', 'scripts/syseeprom-to-json', + 'scripts/teamd_increase_retry_count.py', 'scripts/tempershow', 'scripts/tunnelstat', 'scripts/update_json.py',