From 5af0363042f8128f313e37261958fb19e043c65a Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Wed, 15 Nov 2023 16:07:02 +0800 Subject: [PATCH 1/7] recover testbed(critical path) --- .azure-pipelines/recover_testbed/common.py | 227 ++++++++++++++++++ .../recover_testbed/dut_connection.py | 146 +++++++++++ .../recover_testbed/recover_testbed.py | 171 +++++++++++++ .../recover_testbed/testbed_status.py | 25 ++ 4 files changed, 569 insertions(+) create mode 100644 .azure-pipelines/recover_testbed/common.py create mode 100644 .azure-pipelines/recover_testbed/dut_connection.py create mode 100644 .azure-pipelines/recover_testbed/recover_testbed.py create mode 100644 .azure-pipelines/recover_testbed/testbed_status.py diff --git a/.azure-pipelines/recover_testbed/common.py b/.azure-pipelines/recover_testbed/common.py new file mode 100644 index 00000000000..57cbd28972d --- /dev/null +++ b/.azure-pipelines/recover_testbed/common.py @@ -0,0 +1,227 @@ +import os +import sys +import logging +import termios +import tty +import select +import socket +import time +import pexpect +import ipaddress + +_self_dir = os.path.dirname(os.path.abspath(__file__)) +base_path = os.path.realpath(os.path.join(_self_dir, "../..")) +if base_path not in sys.path: + sys.path.append(base_path) +ansible_path = os.path.realpath(os.path.join(_self_dir, "../../ansible")) +if ansible_path not in sys.path: + sys.path.append(ansible_path) + +from tests.common.plugins.pdu_controller.pdu_manager import pdu_manager_factory + +logger = logging.getLogger(__name__) + +ADD_MANAGEMENT_IP = "ifconfig eth0 {} netmask 255.255.254.0" +ADD_DEFAULT_IP_ROUTE = "ip route add default via {}" +INSTALL_SONIC_IMAGE = "onie-nos-install {}" # noqa E501 + + +def redeploy_testbed(sonichost, testbed_name): + logging.info("YT test redeploy testbed") + + +def get_pdu_managers(sonichosts, conn_graph_facts): + """Get PDU managers for all the devices to be upgraded. + + Args: + sonichosts (SonicHosts): Instance of class SonicHosts + conn_graph_facts (dict): Connection graph dict. + + Returns: + dict: A dict of PDU managers. Key is device hostname. Value is the PDU manager object for the device. + """ + pdu_managers = {} + device_pdu_links = conn_graph_facts['device_pdu_links'] + device_pdu_info = conn_graph_facts['device_pdu_info'] + for hostname in sonichosts.hostnames: + pdu_links = device_pdu_links[hostname] + pdu_info = device_pdu_info[hostname] + pdu_vars = {} + for pdu_name in pdu_info.keys(): + pdu_vars[pdu_name] = sonichosts.get_host_visible_vars(pdu_name) + + pdu_managers[hostname] = pdu_manager_factory(hostname, pdu_links, pdu_info, pdu_vars) + return pdu_managers + + +def posix_shell(dut_console, dutip, image_url): + oldtty = termios.tcgetattr(sys.stdin) + enter_onie_flag = 0 + # install_image_flag = False + gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] + try: + tty.setraw(sys.stdin.fileno()) + tty.setcbreak(sys.stdin.fileno()) + dut_console.remote_conn.settimeout(0.0) + + while True: + r, w, e = select.select([dut_console.remote_conn, sys.stdin], [], []) + if dut_console.remote_conn in r: + try: + x = dut_console.remote_conn.recv(65536) + if len(x) == 0: + sys.stdout.write("\r\n*** EOF\r\n") + break + + x = x.decode('ISO-8859-9') + + if "GNU GRUB" in x: + enter_onie_flag += 1 + continue + + if "SONiC-OS-" in x and enter_onie_flag == 1: + # Send arrow key "down" here. + dut_console.remote_conn.send(b'\x1b[B') + continue + + if "*ONIE" in x and "Install OS" not in x: + dut_console.remote_conn.send("\n") + enter_onie_flag += 1 + + if "ONIE: Starting ONIE Service Discovery" in x: + for i in range(5): + dut_console.remote_conn.send('onie-discovery-stop\n') + dut_console.remote_conn.send("\n") + + dut_console.remote_conn.send(ADD_MANAGEMENT_IP.format(dutip)) + dut_console.remote_conn.send("\n") + + dut_console.remote_conn.send(ADD_DEFAULT_IP_ROUTE.format(gw_ip)) + dut_console.remote_conn.send("\n") + + dut_console.remote_conn.send(INSTALL_SONIC_IMAGE.format(image_url)) + dut_console.remote_conn.send("\n") + # why sleep + time.sleep(60) + x = dut_console.remote_conn.recv(1024) + x = x.decode('ISO-8859-9') + # sample output + if "ETA" in x: + break + + if "sonic login:" in x: + dut_console.remote_conn.close() + + sys.stdout.write(x) + sys.stdout.flush() + except socket.timeout: + pass + if sys.stdin in r: + x = sys.stdin.read(1) + if len(x) == 0: + break + dut_console.remote_conn.send(x) + + finally: + termios.tcsetattr(sys.stdin, termios.TCSADRAIN, oldtty) + +def posix_shell_aboot(dut_console, dutip, image_url): + oldtty = termios.tcgetattr(sys.stdin) + install_image_flag = True + gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] + try: + tty.setraw(sys.stdin.fileno()) + tty.setcbreak(sys.stdin.fileno()) + dut_console.remote_conn.settimeout(0.0) + + while True: + r, w, e = select.select([dut_console.remote_conn, sys.stdin], [], []) + if dut_console.remote_conn in r: + try: + x = dut_console.remote_conn.recv(65536) + if len(x) == 0: + sys.stdout.write("\r\n*** EOF\r\n") + break + + x = x.decode('ISO-8859-9') + + if install_image_flag: + # "Press Control-C now to enter Aboot shell" + if "Press" in x: + dut_console.remote_conn.send("\x03") + continue + + if "Aboot" in x and "#" in x: + dut_console.remote_conn.send("ifconfig ma1 {} netmask 255.255.254.0".format(dutip)) + dut_console.remote_conn.send("\n") + + time.sleep(1) + + dut_console.remote_conn.send("route add default gw {}".format(gw_ip)) + dut_console.remote_conn.send("\n") + + time.sleep(1) + + dut_console.remote_conn.send("ip route add default via {} dev ma1".format(gw_ip)) + dut_console.remote_conn.send("\n") + + time.sleep(1) + + dut_console.remote_conn.send("wget {}".format(image_url)) + dut_console.remote_conn.send("\n") + + for i in range(5): + time.sleep(10) + x = dut_console.remote_conn.recv(1024) + x = x.decode('ISO-8859-9') + if "ETA" in x: + break + + dut_console.remote_conn.send("echo 'SWI=flash:{}' > boot-config".format(image_url.split("/")[-1])) + dut_console.remote_conn.send("\n") + + dut_console.remote_conn.send("reboot") + dut_console.remote_conn.send("\n") + + install_image_flag = False + + if "login:" in x: + dut_console.remote_conn.close() + + sys.stdout.write(x) + sys.stdout.flush() + except socket.timeout: + pass + if sys.stdin in r: + x = sys.stdin.read(1) + if len(x) == 0: + break + dut_console.remote_conn.send(x) + + finally: + termios.tcsetattr(sys.stdin, termios.TCSADRAIN, oldtty) + + +def do_power_cycle(sonichost, conn_graph_facts, localhost): + pdu_managers = get_pdu_managers(sonichost, conn_graph_facts) + + for hostname, pdu_manager in pdu_managers.items(): + logger.info("Turn off power outlets to {}".format(hostname)) + pdu_manager.turn_off_outlet() + localhost.pause(seconds=30, prompt="Pause between power off/on") + + for hostname, pdu_manager in pdu_managers.items(): + logger.info("Turn on power outlets to {}".format(hostname)) + pdu_manager.turn_on_outlet() + + +def check_sonic_installer(sonichost, sonic_username, sonic_password, sonic_ip, image_url): + client = pexpect.spawn('ssh {}@{} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'.format(sonic_username, sonic_ip)) # noqa E501 + client.expect("admin@{}'s password:".format(sonic_ip)) + client.sendline(sonic_password) + client.expect(["admin@sonic", "admin@{}".format(sonichost.hostname)]) + client.sendline("sudo sonic-installer install {}".format(image_url)) # noqa E501 + client.expect("New image will be installed") + client.close() + # client.sendline("y") + # client.expect("Downloading image...") diff --git a/.azure-pipelines/recover_testbed/dut_connection.py b/.azure-pipelines/recover_testbed/dut_connection.py new file mode 100644 index 00000000000..2fefdb63981 --- /dev/null +++ b/.azure-pipelines/recover_testbed/dut_connection.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import logging +import os +import sys +import paramiko +import socket +import glob +import re +import yaml +import jinja2 +from tests.common.connections.console_host import ConsoleHost +from paramiko.ssh_exception import AuthenticationException + +_self_dir = os.path.dirname(os.path.abspath(__file__)) +base_path = os.path.realpath(os.path.join(_self_dir, "../..")) +if base_path not in sys.path: + sys.path.append(base_path) +ansible_path = os.path.realpath(os.path.join(_self_dir, "../../ansible")) +if ansible_path not in sys.path: + sys.path.append(ansible_path) + +logger = logging.getLogger(__name__) + +RC_SSH_SUCCESS = 0 +RC_SOCKET_TIMEOUT = 1 +RC_PASSWORD_FAILED = 2 + + +def creds_on_dut(sonichost): + groups = sonichost.im.get_host(sonichost.hostname).get_vars()['group_names'] + groups.append("fanout") + logger.info("dut {} belongs to groups {}".format(sonichost.hostname, groups)) + exclude_regex_patterns = [ + r'topo_.*\.yml', + r'breakout_speed\.yml', + r'lag_fanout_ports_test_vars\.yml', + r'qos\.yml', + r'sku-sensors-data\.yml', + r'mux_simulator_http_port_map\.yml' + ] + files = glob.glob("group_vars/all/*.yml") + files += glob.glob("vars/*.yml") + for group in groups: + files += glob.glob("group_vars/{}/*.yml".format(group)) + filtered_files = [ + f for f in files if not re.search('|'.join(exclude_regex_patterns), f) + ] + + creds = {} + for f in filtered_files: + with open(f) as stream: + v = yaml.safe_load(stream) + if v is not None: + creds.update(v) + else: + logger.info("skip empty var file {}".format(f)) + + cred_vars = [ + "sonicadmin_user", + "sonicadmin_password", + "docker_registry_host", + "docker_registry_username", + "docker_registry_password", + "public_docker_registry_host" + ] + + hostvars = sonichost.vm.get_vars( + host=sonichost.im.get_hosts(pattern='sonic')[0]) + + for cred_var in cred_vars: + if cred_var in creds: + creds[cred_var] = jinja2.Template(creds[cred_var]).render(**hostvars) + + if "console_login" not in list(hostvars.keys()): + console_login_creds = {} + else: + console_login_creds = hostvars["console_login"] + + creds["console_user"] = {} + creds["console_password"] = {} + for k, v in list(console_login_creds.items()): + creds["console_user"][k] = v["user"] + creds["console_password"][k] = v["passwd"] + return creds + + +def get_console_info(sonichost, conn_graph_facts): + console_host = conn_graph_facts['device_console_info'][sonichost.hostname]['ManagementIp'] + console_port = conn_graph_facts['device_console_link'][sonichost.hostname]['ConsolePort']['peerport'] + console_type = conn_graph_facts['device_console_link'][sonichost.hostname]['ConsolePort']['type'] + console_username = conn_graph_facts['device_console_link'][sonichost.hostname]['ConsolePort']['proxy'] + + return console_host, console_port, console_type, console_username + + +def get_ssh_info(sonichost): + creds = creds_on_dut(sonichost) + sonic_username = creds['sonicadmin_user'] + sonicadmin_alt_password = sonichost.vm.get_vars( + host=sonichost.im.get_hosts(pattern='sonic')[0]).get("ansible_altpassword") + sonic_password = [creds['sonicadmin_password'], sonicadmin_alt_password] + sonic_ip = sonichost.im.get_host(sonichost.hostname).vars['ansible_host'] + logging.info("sonic username: {}, password: {}".format(sonic_username, sonic_password)) + return sonic_username, sonic_password, sonic_ip + + +def duthost_console(sonichost, conn_graph_facts, localhost): + console_host, console_port, console_type, console_username = get_console_info(sonichost, conn_graph_facts) + console_type = "console_" + console_type + + # console password and sonic_password are lists, which may contain more than one password + sonicadmin_alt_password = sonichost.vm.get_vars( + host=sonichost.im.get_hosts(pattern='sonic')[0]).get("ansible_altpassword") + creds = creds_on_dut(sonichost) + + host = ConsoleHost(console_type=console_type, + console_host=console_host, + console_port=console_port, + sonic_username=creds['sonicadmin_user'], + sonic_password=[creds['sonicadmin_password'], sonicadmin_alt_password], + console_username=console_username, + console_password=creds['console_password'][console_type]) + + return host + + +def duthost_ssh(sonichost): + sonic_username, sonic_passwords, sonic_ip = get_ssh_info(sonichost) + for password in sonic_passwords: + try: + ssh = paramiko.SSHClient() + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + ssh.connect(sonic_ip, username=sonic_username, password=password, + allow_agent=False, look_for_keys=False, timeout=10) + _stdin, _stdout, _stderr = ssh.exec_command('show version') + prompt = ssh.invoke_shell() + logging.info("YT test prompt {}".format(prompt.recv(1000).decode())) + ssh.close() + return sonic_username, password, sonic_ip + except AuthenticationException: + continue + except socket.timeout as e: + logger.info("Cannot access DUT {} via ssh, error: {}".format(sonichost.hostname, e)) + return RC_SOCKET_TIMEOUT + return RC_PASSWORD_FAILED diff --git a/.azure-pipelines/recover_testbed/recover_testbed.py b/.azure-pipelines/recover_testbed/recover_testbed.py new file mode 100644 index 00000000000..97894a097bd --- /dev/null +++ b/.azure-pipelines/recover_testbed/recover_testbed.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +import argparse +import logging +import os +import sys +from common import do_power_cycle, check_sonic_installer, posix_shell_aboot + +_self_dir = os.path.dirname(os.path.abspath(__file__)) +base_path = os.path.realpath(os.path.join(_self_dir, "../..")) +if base_path not in sys.path: + sys.path.append(base_path) +ansible_path = os.path.realpath(os.path.join(_self_dir, "../../ansible")) +if ansible_path not in sys.path: + sys.path.append(ansible_path) + +from devutil.devices.factory import init_localhost, init_testbed_sonichosts # noqa E402 +from dut_connection import duthost_ssh, duthost_console, get_ssh_info # noqa E402 +from testbed_status import dut_lose_management_ip # noqa F401 + +logger = logging.getLogger(__name__) + +RC_INIT_FAILED = 1 + +""" +This script must be run under folder "sonic-mgmt/ansible" +SSH connection success: check if sonic-installer is usable, if not, do power cycle. + +If console fails, do power cycle +""" + + +def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url): + try: + dut_console = duthost_console(sonichost, conn_graph_facts, localhost) + + do_power_cycle(sonichost, conn_graph_facts, localhost) + posix_shell_aboot(dut_console, sonic_ip, image_url) + + dut_lose_management_ip(sonichost, conn_graph_facts, localhost, sonic_ip) + except Exception as e: + logger.info(e) + return + + +def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url): + # flag: need_to_recover + for sonichost in sonichosts: + sonic_username, sonic_password, sonic_ip = get_ssh_info(sonichost) + while True: # for i in range(3) + dut_ssh = duthost_ssh(sonichost) + + if type(dut_ssh) == tuple: + logger.info("SSH success.") + sonic_username = dut_ssh[0] + sonic_password = dut_ssh[1] + sonic_ip = dut_ssh[2] + + try: + check_sonic_installer(sonichost, sonic_username, sonic_password, sonic_ip, image_url) + return + except Exception as e: # Exception + logger.info("Exception caught while executing cmd. Error message: {}".format(e)) + recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url) + + elif dut_ssh == 1: # RC_SOCKET_TIMEOUT + # Do power cycle + recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url) + else: + logger.info("Authentication failed. Passwords are incorrect.") + return + + +def validate_args(args): + _log_level_map = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL + } + logging.basicConfig( + stream=sys.stdout, + level=_log_level_map[args.log_level], + format="%(asctime)s %(filename)s#%(lineno)d %(levelname)s - %(message)s" + ) + + +def main(args): + logger.info("Validating arguments") + validate_args(args) + + logger.info("Initializing hosts") + localhost = init_localhost(args.inventory, options={"verbosity": args.verbosity}) + sonichosts = init_testbed_sonichosts( + args.inventory, args.testbed_name, testbed_file=args.tbfile, options={"verbosity": args.verbosity} + ) + + if not localhost or not sonichosts: + sys.exit(RC_INIT_FAILED) + + conn_graph_facts = localhost.conn_graph_facts( + hosts=sonichosts.hostnames, + filepath=os.path.join(ansible_path, "files") + )["ansible_facts"] + + recover_testbed(sonichosts, conn_graph_facts, localhost, args.image) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Tool for getting sonic device version.") + + parser.add_argument( + "-i", "--inventory", + dest="inventory", + nargs="+", + help="Ansible inventory file") + + parser.add_argument( + "-t", "--testbed-name", + type=str, + required=True, + dest="testbed_name", + help="Testbed name." + ) + + parser.add_argument( + "--tbfile", + type=str, + dest="tbfile", + default="testbed.yaml", + help="Testbed definition file." + ) + + parser.add_argument( + "-v", "--verbosity", + type=int, + dest="verbosity", + default=2, + help="Log verbosity (0-3)." + ) + + parser.add_argument( + "--log-level", + type=str, + dest="log_level", + choices=["debug", "info", "warning", "error", "critical"], + default="debug", + help="Loglevel" + ) + + parser.add_argument( + "-o", "--output", + type=str, + dest="output", + required=False, + help="Output duts version to the specified file." + ) + + parser.add_argument( + "--image", + type=str, + dest="image", + required=True, + help="The image url" + ) + + args = parser.parse_args() + main(args) diff --git a/.azure-pipelines/recover_testbed/testbed_status.py b/.azure-pipelines/recover_testbed/testbed_status.py new file mode 100644 index 00000000000..f5d8b97a777 --- /dev/null +++ b/.azure-pipelines/recover_testbed/testbed_status.py @@ -0,0 +1,25 @@ +import logging +import ipaddress +from dut_connection import duthost_ssh, duthost_console # noqa E402 + +logger = logging.getLogger(__name__) + + +ADD_MANAGEMENT_IP = "sudo ip addr add {}/23 brd {} dev eth0" +ADD_DEFAULT_IP_ROUTE = "sudo ip route add default via {}" +# add /etc/network/interfaces -- ip, mask, gw + +def dut_lose_management_ip(sonichost, conn_graph_facts, localhost, dutip): + # Recover DUTs + logger.info("=====Recover start=====") + dut_console = duthost_console(sonichost, conn_graph_facts, localhost) + gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] + brd_ip = ipaddress.ip_interface("{}/23".format(dutip)).network.broadcast_address + try: + ret = dut_console.send_command(ADD_MANAGEMENT_IP.format(dutip, brd_ip)) # noqa F841 + dut_console.send_command(ADD_DEFAULT_IP_ROUTE.format(gw_ip)) + except Exception as e: + logging.info(e) + finally: + logger.info("=====Recover finish=====") + dut_console.disconnect() From d7ae0d5f74944c4df1b39fa5172cacf78d950bc0 Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Tue, 26 Dec 2023 16:23:28 +0800 Subject: [PATCH 2/7] Add hwsku --- .azure-pipelines/recover_testbed/common.py | 8 +++--- .../recover_testbed/recover_testbed.py | 27 ++++++++++++++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/.azure-pipelines/recover_testbed/common.py b/.azure-pipelines/recover_testbed/common.py index 57cbd28972d..809428c14c1 100644 --- a/.azure-pipelines/recover_testbed/common.py +++ b/.azure-pipelines/recover_testbed/common.py @@ -17,7 +17,7 @@ if ansible_path not in sys.path: sys.path.append(ansible_path) -from tests.common.plugins.pdu_controller.pdu_manager import pdu_manager_factory +from tests.common.plugins.pdu_controller.pdu_manager import pdu_manager_factory # noqa E402 logger = logging.getLogger(__name__) @@ -54,7 +54,7 @@ def get_pdu_managers(sonichosts, conn_graph_facts): return pdu_managers -def posix_shell(dut_console, dutip, image_url): +def posix_shell_onie(dut_console, dutip, image_url): oldtty = termios.tcgetattr(sys.stdin) enter_onie_flag = 0 # install_image_flag = False @@ -125,6 +125,7 @@ def posix_shell(dut_console, dutip, image_url): finally: termios.tcsetattr(sys.stdin, termios.TCSADRAIN, oldtty) + def posix_shell_aboot(dut_console, dutip, image_url): oldtty = termios.tcgetattr(sys.stdin) install_image_flag = True @@ -177,7 +178,8 @@ def posix_shell_aboot(dut_console, dutip, image_url): if "ETA" in x: break - dut_console.remote_conn.send("echo 'SWI=flash:{}' > boot-config".format(image_url.split("/")[-1])) + dut_console.remote_conn.send("echo 'SWI=flash:{}' > boot-config" + .format(image_url.split("/")[-1])) dut_console.remote_conn.send("\n") dut_console.remote_conn.send("reboot") diff --git a/.azure-pipelines/recover_testbed/recover_testbed.py b/.azure-pipelines/recover_testbed/recover_testbed.py index 97894a097bd..b39b2b4c717 100644 --- a/.azure-pipelines/recover_testbed/recover_testbed.py +++ b/.azure-pipelines/recover_testbed/recover_testbed.py @@ -4,7 +4,7 @@ import logging import os import sys -from common import do_power_cycle, check_sonic_installer, posix_shell_aboot +from common import do_power_cycle, check_sonic_installer, posix_shell_aboot, posix_shell_onie _self_dir = os.path.dirname(os.path.abspath(__file__)) base_path = os.path.realpath(os.path.join(_self_dir, "../..")) @@ -30,12 +30,17 @@ """ -def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url): +def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku): try: dut_console = duthost_console(sonichost, conn_graph_facts, localhost) do_power_cycle(sonichost, conn_graph_facts, localhost) - posix_shell_aboot(dut_console, sonic_ip, image_url) + if hwsku == "Arista": + posix_shell_aboot(dut_console, sonic_ip, image_url) + elif hwsku == "Cisco": + pass + else: + posix_shell_onie(dut_console, sonic_ip, image_url) dut_lose_management_ip(sonichost, conn_graph_facts, localhost, sonic_ip) except Exception as e: @@ -43,7 +48,7 @@ def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_ return -def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url): +def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url, hwsku): # flag: need_to_recover for sonichost in sonichosts: sonic_username, sonic_password, sonic_ip = get_ssh_info(sonichost) @@ -61,11 +66,11 @@ def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url): return except Exception as e: # Exception logger.info("Exception caught while executing cmd. Error message: {}".format(e)) - recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url) + recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku) elif dut_ssh == 1: # RC_SOCKET_TIMEOUT # Do power cycle - recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url) + recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku) else: logger.info("Authentication failed. Passwords are incorrect.") return @@ -104,7 +109,7 @@ def main(args): filepath=os.path.join(ansible_path, "files") )["ansible_facts"] - recover_testbed(sonichosts, conn_graph_facts, localhost, args.image) + recover_testbed(sonichosts, conn_graph_facts, localhost, args.image, args.hwsku) if __name__ == "__main__": @@ -167,5 +172,13 @@ def main(args): help="The image url" ) + parser.add_argument( + "-h", "--hwsku", + type=str, + dest="hwsku", + required=True, + help="Hwsku of DUT" + ) + args = parser.parse_args() main(args) From bf6219a1aaa8cce80a8056d84317a63a9e4d5519 Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Wed, 27 Dec 2023 11:22:05 +0800 Subject: [PATCH 3/7] Add blank --- .azure-pipelines/recover_testbed/testbed_status.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/recover_testbed/testbed_status.py b/.azure-pipelines/recover_testbed/testbed_status.py index f5d8b97a777..dfee81f9245 100644 --- a/.azure-pipelines/recover_testbed/testbed_status.py +++ b/.azure-pipelines/recover_testbed/testbed_status.py @@ -9,6 +9,7 @@ ADD_DEFAULT_IP_ROUTE = "sudo ip route add default via {}" # add /etc/network/interfaces -- ip, mask, gw + def dut_lose_management_ip(sonichost, conn_graph_facts, localhost, dutip): # Recover DUTs logger.info("=====Recover start=====") From 63aa17d8823a45a91a1dfb87da0c6afb9da2fec4 Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Wed, 27 Dec 2023 16:13:59 +0800 Subject: [PATCH 4/7] Add some todo in the code --- .azure-pipelines/recover_testbed/common.py | 21 ++++++------- .../recover_testbed/dut_connection.py | 3 -- .../recover_testbed/recover_testbed.py | 31 ++++++++++++------- .../recover_testbed/testbed_status.py | 2 +- 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/.azure-pipelines/recover_testbed/common.py b/.azure-pipelines/recover_testbed/common.py index 809428c14c1..beb5190da3c 100644 --- a/.azure-pipelines/recover_testbed/common.py +++ b/.azure-pipelines/recover_testbed/common.py @@ -26,10 +26,6 @@ INSTALL_SONIC_IMAGE = "onie-nos-install {}" # noqa E501 -def redeploy_testbed(sonichost, testbed_name): - logging.info("YT test redeploy testbed") - - def get_pdu_managers(sonichosts, conn_graph_facts): """Get PDU managers for all the devices to be upgraded. @@ -57,7 +53,6 @@ def get_pdu_managers(sonichosts, conn_graph_facts): def posix_shell_onie(dut_console, dutip, image_url): oldtty = termios.tcgetattr(sys.stdin) enter_onie_flag = 0 - # install_image_flag = False gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] try: tty.setraw(sys.stdin.fileno()) @@ -89,6 +84,7 @@ def posix_shell_onie(dut_console, dutip, image_url): enter_onie_flag += 1 if "ONIE: Starting ONIE Service Discovery" in x: + # TODO: Define a function to send command here for i in range(5): dut_console.remote_conn.send('onie-discovery-stop\n') dut_console.remote_conn.send("\n") @@ -101,11 +97,11 @@ def posix_shell_onie(dut_console, dutip, image_url): dut_console.remote_conn.send(INSTALL_SONIC_IMAGE.format(image_url)) dut_console.remote_conn.send("\n") - # why sleep + # We will wait some time to connect to image server time.sleep(60) x = dut_console.remote_conn.recv(1024) x = x.decode('ISO-8859-9') - # sample output + # TODO: Give a sample output here if "ETA" in x: break @@ -147,12 +143,15 @@ def posix_shell_aboot(dut_console, dutip, image_url): x = x.decode('ISO-8859-9') if install_image_flag: + # TODO: We can not exactly determine the string in buffer, + # TODO: in the future, maybe we will gather the buffer and then process them # "Press Control-C now to enter Aboot shell" if "Press" in x: dut_console.remote_conn.send("\x03") continue if "Aboot" in x and "#" in x: + # TODO: Define a function to send command here dut_console.remote_conn.send("ifconfig ma1 {} netmask 255.255.254.0".format(dutip)) dut_console.remote_conn.send("\n") @@ -218,12 +217,12 @@ def do_power_cycle(sonichost, conn_graph_facts, localhost): def check_sonic_installer(sonichost, sonic_username, sonic_password, sonic_ip, image_url): - client = pexpect.spawn('ssh {}@{} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'.format(sonic_username, sonic_ip)) # noqa E501 + client = pexpect.spawn('ssh {}@{} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' + .format(sonic_username, sonic_ip)) client.expect("admin@{}'s password:".format(sonic_ip)) client.sendline(sonic_password) client.expect(["admin@sonic", "admin@{}".format(sonichost.hostname)]) - client.sendline("sudo sonic-installer install {}".format(image_url)) # noqa E501 + client.sendline("sudo sonic-installer install {}" + .format(image_url)) client.expect("New image will be installed") client.close() - # client.sendline("y") - # client.expect("Downloading image...") diff --git a/.azure-pipelines/recover_testbed/dut_connection.py b/.azure-pipelines/recover_testbed/dut_connection.py index 2fefdb63981..fd9436f6696 100644 --- a/.azure-pipelines/recover_testbed/dut_connection.py +++ b/.azure-pipelines/recover_testbed/dut_connection.py @@ -133,9 +133,6 @@ def duthost_ssh(sonichost): ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) ssh.connect(sonic_ip, username=sonic_username, password=password, allow_agent=False, look_for_keys=False, timeout=10) - _stdin, _stdout, _stderr = ssh.exec_command('show version') - prompt = ssh.invoke_shell() - logging.info("YT test prompt {}".format(prompt.recv(1000).decode())) ssh.close() return sonic_username, password, sonic_ip except AuthenticationException: diff --git a/.azure-pipelines/recover_testbed/recover_testbed.py b/.azure-pipelines/recover_testbed/recover_testbed.py index b39b2b4c717..561df5f2141 100644 --- a/.azure-pipelines/recover_testbed/recover_testbed.py +++ b/.azure-pipelines/recover_testbed/recover_testbed.py @@ -35,11 +35,14 @@ def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_ dut_console = duthost_console(sonichost, conn_graph_facts, localhost) do_power_cycle(sonichost, conn_graph_facts, localhost) - if hwsku == "Arista": + + type = hwsku.split('-')[0].lower() + + if type in ["arista"]: posix_shell_aboot(dut_console, sonic_ip, image_url) - elif hwsku == "Cisco": + elif type in ["Cisco"]: pass - else: + elif type in ["mellanox", "nexus", "acs"]: posix_shell_onie(dut_console, sonic_ip, image_url) dut_lose_management_ip(sonichost, conn_graph_facts, localhost, sonic_ip) @@ -49,10 +52,10 @@ def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_ def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url, hwsku): - # flag: need_to_recover for sonichost in sonichosts: sonic_username, sonic_password, sonic_ip = get_ssh_info(sonichost) - while True: # for i in range(3) + need_to_recover = False + for i in range(3): dut_ssh = duthost_ssh(sonichost) if type(dut_ssh) == tuple: @@ -63,18 +66,22 @@ def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url, hwsku): try: check_sonic_installer(sonichost, sonic_username, sonic_password, sonic_ip, image_url) - return - except Exception as e: # Exception + break + # TODO: specify which Exception it is + except Exception as e: logger.info("Exception caught while executing cmd. Error message: {}".format(e)) - recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku) - - elif dut_ssh == 1: # RC_SOCKET_TIMEOUT + need_to_recover = True + # TODO: Define the return message like RC_SOCKET_TIMEOUT in common file + elif dut_ssh == 1: # Do power cycle - recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku) + need_to_recover = True else: logger.info("Authentication failed. Passwords are incorrect.") return + if need_to_recover: + recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku) + def validate_args(args): _log_level_map = { @@ -173,7 +180,7 @@ def main(args): ) parser.add_argument( - "-h", "--hwsku", + "--hwsku", type=str, dest="hwsku", required=True, diff --git a/.azure-pipelines/recover_testbed/testbed_status.py b/.azure-pipelines/recover_testbed/testbed_status.py index dfee81f9245..3ac01479de4 100644 --- a/.azure-pipelines/recover_testbed/testbed_status.py +++ b/.azure-pipelines/recover_testbed/testbed_status.py @@ -7,7 +7,7 @@ ADD_MANAGEMENT_IP = "sudo ip addr add {}/23 brd {} dev eth0" ADD_DEFAULT_IP_ROUTE = "sudo ip route add default via {}" -# add /etc/network/interfaces -- ip, mask, gw +# TODO: Add mgmt ip into file /etc/network/interfaces -- ip, mask, gw def dut_lose_management_ip(sonichost, conn_graph_facts, localhost, dutip): From 746e457a72226c97182fb7b06750ec39faa0e090 Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Thu, 28 Dec 2023 11:00:29 +0800 Subject: [PATCH 5/7] Reserve recovery of cisco --- .azure-pipelines/recover_testbed/recover_testbed.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines/recover_testbed/recover_testbed.py b/.azure-pipelines/recover_testbed/recover_testbed.py index 561df5f2141..0e7e652dcbe 100644 --- a/.azure-pipelines/recover_testbed/recover_testbed.py +++ b/.azure-pipelines/recover_testbed/recover_testbed.py @@ -40,10 +40,12 @@ def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_ if type in ["arista"]: posix_shell_aboot(dut_console, sonic_ip, image_url) - elif type in ["Cisco"]: - pass + # elif type in ["Cisco"]: + # return elif type in ["mellanox", "nexus", "acs"]: posix_shell_onie(dut_console, sonic_ip, image_url) + else: + return dut_lose_management_ip(sonichost, conn_graph_facts, localhost, sonic_ip) except Exception as e: From 8b945e0eb9d2119b926fc296a6943b21c5b612dc Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Thu, 28 Dec 2023 15:38:33 +0800 Subject: [PATCH 6/7] Gw ip --- .azure-pipelines/recover_testbed/common.py | 22 +++++++++---------- .../recover_testbed/recover_testbed.py | 14 +++++++----- .../recover_testbed/testbed_status.py | 16 +++++--------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/.azure-pipelines/recover_testbed/common.py b/.azure-pipelines/recover_testbed/common.py index beb5190da3c..6c869c8a53d 100644 --- a/.azure-pipelines/recover_testbed/common.py +++ b/.azure-pipelines/recover_testbed/common.py @@ -21,10 +21,6 @@ logger = logging.getLogger(__name__) -ADD_MANAGEMENT_IP = "ifconfig eth0 {} netmask 255.255.254.0" -ADD_DEFAULT_IP_ROUTE = "ip route add default via {}" -INSTALL_SONIC_IMAGE = "onie-nos-install {}" # noqa E501 - def get_pdu_managers(sonichosts, conn_graph_facts): """Get PDU managers for all the devices to be upgraded. @@ -50,10 +46,10 @@ def get_pdu_managers(sonichosts, conn_graph_facts): return pdu_managers -def posix_shell_onie(dut_console, dutip, image_url): +def posix_shell_onie(dut_console, mgmt_ip, image_url): oldtty = termios.tcgetattr(sys.stdin) enter_onie_flag = 0 - gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] + gw_ip = list(ipaddress.ip_interface(mgmt_ip).network.hosts())[0] try: tty.setraw(sys.stdin.fileno()) tty.setcbreak(sys.stdin.fileno()) @@ -89,13 +85,14 @@ def posix_shell_onie(dut_console, dutip, image_url): dut_console.remote_conn.send('onie-discovery-stop\n') dut_console.remote_conn.send("\n") - dut_console.remote_conn.send(ADD_MANAGEMENT_IP.format(dutip)) + dut_console.remote_conn.send("ifconfig eth0 {} netmask {}".format(mgmt_ip.split('/')[0], + ipaddress.ip_interface(mgmt_ip).with_netmask)) dut_console.remote_conn.send("\n") - dut_console.remote_conn.send(ADD_DEFAULT_IP_ROUTE.format(gw_ip)) + dut_console.remote_conn.send("ip route add default via {}".format(gw_ip)) dut_console.remote_conn.send("\n") - dut_console.remote_conn.send(INSTALL_SONIC_IMAGE.format(image_url)) + dut_console.remote_conn.send("onie-nos-install {}".format(image_url)) dut_console.remote_conn.send("\n") # We will wait some time to connect to image server time.sleep(60) @@ -122,10 +119,10 @@ def posix_shell_onie(dut_console, dutip, image_url): termios.tcsetattr(sys.stdin, termios.TCSADRAIN, oldtty) -def posix_shell_aboot(dut_console, dutip, image_url): +def posix_shell_aboot(dut_console, mgmt_ip, image_url): oldtty = termios.tcgetattr(sys.stdin) install_image_flag = True - gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] + gw_ip = list(ipaddress.ip_interface(mgmt_ip).network.hosts())[0] try: tty.setraw(sys.stdin.fileno()) tty.setcbreak(sys.stdin.fileno()) @@ -152,7 +149,8 @@ def posix_shell_aboot(dut_console, dutip, image_url): if "Aboot" in x and "#" in x: # TODO: Define a function to send command here - dut_console.remote_conn.send("ifconfig ma1 {} netmask 255.255.254.0".format(dutip)) + dut_console.remote_conn.send("ifconfig ma1 {} netmask {}".format(mgmt_ip.split('/')[0], + ipaddress.ip_interface(mgmt_ip).with_netmask)) dut_console.remote_conn.send("\n") time.sleep(1) diff --git a/.azure-pipelines/recover_testbed/recover_testbed.py b/.azure-pipelines/recover_testbed/recover_testbed.py index 0e7e652dcbe..2705f4eddbe 100644 --- a/.azure-pipelines/recover_testbed/recover_testbed.py +++ b/.azure-pipelines/recover_testbed/recover_testbed.py @@ -30,7 +30,7 @@ """ -def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku): +def recover_via_console(sonichost, conn_graph_facts, localhost, mgmt_ip, image_url, hwsku): try: dut_console = duthost_console(sonichost, conn_graph_facts, localhost) @@ -39,15 +39,15 @@ def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_ type = hwsku.split('-')[0].lower() if type in ["arista"]: - posix_shell_aboot(dut_console, sonic_ip, image_url) + posix_shell_aboot(dut_console, mgmt_ip, image_url) # elif type in ["Cisco"]: # return elif type in ["mellanox", "nexus", "acs"]: - posix_shell_onie(dut_console, sonic_ip, image_url) + posix_shell_onie(dut_console, mgmt_ip, image_url) else: return - dut_lose_management_ip(sonichost, conn_graph_facts, localhost, sonic_ip) + dut_lose_management_ip(sonichost, conn_graph_facts, localhost, mgmt_ip) except Exception as e: logger.info(e) return @@ -55,7 +55,7 @@ def recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_ def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url, hwsku): for sonichost in sonichosts: - sonic_username, sonic_password, sonic_ip = get_ssh_info(sonichost) + # sonic_username, sonic_password, sonic_ip = get_ssh_info(sonichost) need_to_recover = False for i in range(3): dut_ssh = duthost_ssh(sonichost) @@ -81,8 +81,10 @@ def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url, hwsku): logger.info("Authentication failed. Passwords are incorrect.") return + # Get dut ip with network mask + mgmt_ip = conn_graph_facts["device_info"][sonichost.hostname]["ManagementIp"] if need_to_recover: - recover_via_console(sonichost, conn_graph_facts, localhost, sonic_ip, image_url, hwsku) + recover_via_console(sonichost, conn_graph_facts, localhost, mgmt_ip, image_url, hwsku) def validate_args(args): diff --git a/.azure-pipelines/recover_testbed/testbed_status.py b/.azure-pipelines/recover_testbed/testbed_status.py index 3ac01479de4..acedc315495 100644 --- a/.azure-pipelines/recover_testbed/testbed_status.py +++ b/.azure-pipelines/recover_testbed/testbed_status.py @@ -5,20 +5,16 @@ logger = logging.getLogger(__name__) -ADD_MANAGEMENT_IP = "sudo ip addr add {}/23 brd {} dev eth0" -ADD_DEFAULT_IP_ROUTE = "sudo ip route add default via {}" -# TODO: Add mgmt ip into file /etc/network/interfaces -- ip, mask, gw - - -def dut_lose_management_ip(sonichost, conn_graph_facts, localhost, dutip): +def dut_lose_management_ip(sonichost, conn_graph_facts, localhost, mgmt_ip): # Recover DUTs logger.info("=====Recover start=====") dut_console = duthost_console(sonichost, conn_graph_facts, localhost) - gw_ip = list(ipaddress.ip_interface("{}/23".format(dutip)).network.hosts())[0] - brd_ip = ipaddress.ip_interface("{}/23".format(dutip)).network.broadcast_address + gw_ip = list(ipaddress.ip_interface(mgmt_ip).network.hosts())[0] + brd_ip = ipaddress.ip_interface(mgmt_ip).network.broadcast_address try: - ret = dut_console.send_command(ADD_MANAGEMENT_IP.format(dutip, brd_ip)) # noqa F841 - dut_console.send_command(ADD_DEFAULT_IP_ROUTE.format(gw_ip)) + # TODO: Add mgmt ip into file /etc/network/interfaces -- ip, mask, gw + ret = dut_console.send_command("sudo ip addr add {} brd {} dev eth0".format(mgmt_ip, brd_ip)) # noqa F841 + dut_console.send_command("sudo ip route add default via {}".format(gw_ip)) except Exception as e: logging.info(e) finally: From 481edbc5909543c07b6a67b4f30e565fb6889946 Mon Sep 17 00:00:00 2001 From: Yutong Zhang Date: Thu, 28 Dec 2023 15:51:20 +0800 Subject: [PATCH 7/7] fix a bug --- .azure-pipelines/recover_testbed/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines/recover_testbed/common.py b/.azure-pipelines/recover_testbed/common.py index 6c869c8a53d..06613ff8e6d 100644 --- a/.azure-pipelines/recover_testbed/common.py +++ b/.azure-pipelines/recover_testbed/common.py @@ -86,7 +86,7 @@ def posix_shell_onie(dut_console, mgmt_ip, image_url): dut_console.remote_conn.send("\n") dut_console.remote_conn.send("ifconfig eth0 {} netmask {}".format(mgmt_ip.split('/')[0], - ipaddress.ip_interface(mgmt_ip).with_netmask)) + ipaddress.ip_interface(mgmt_ip).with_netmask.split('/')[1])) dut_console.remote_conn.send("\n") dut_console.remote_conn.send("ip route add default via {}".format(gw_ip)) @@ -150,7 +150,7 @@ def posix_shell_aboot(dut_console, mgmt_ip, image_url): if "Aboot" in x and "#" in x: # TODO: Define a function to send command here dut_console.remote_conn.send("ifconfig ma1 {} netmask {}".format(mgmt_ip.split('/')[0], - ipaddress.ip_interface(mgmt_ip).with_netmask)) + ipaddress.ip_interface(mgmt_ip).with_netmask.split('/')[1])) dut_console.remote_conn.send("\n") time.sleep(1)