# NOTE: Reconstructed from a whitespace-collapsed git patch that adds four new
# files under .azure-pipelines/recover_testbed/. Each banner below marks where
# one file of that patch starts; the sections are separate modules.
# NOTE(review): block nesting was inferred from syntax and blank-line markers
# because the original indentation was lost -- confirm against the real patch.

# =============================================================================
# diff --git a/.azure-pipelines/recover_testbed/common.py
#            b/.azure-pipelines/recover_testbed/common.py
# new file mode 100644, index 00000000000..06613ff8e6d
# =============================================================================
import os
import sys
import logging
import termios
import tty
import select
import socket
import time
import pexpect
import ipaddress

_self_dir = os.path.dirname(os.path.abspath(__file__))
base_path = os.path.realpath(os.path.join(_self_dir, "../.."))
if base_path not in sys.path:
    sys.path.append(base_path)
ansible_path = os.path.realpath(os.path.join(_self_dir, "../../ansible"))
if ansible_path not in sys.path:
    sys.path.append(ansible_path)

from tests.common.plugins.pdu_controller.pdu_manager import pdu_manager_factory    # noqa E402

logger = logging.getLogger(__name__)


def get_pdu_managers(sonichosts, conn_graph_facts):
    """Get PDU managers for all the devices to be upgraded.

    Args:
        sonichosts (SonicHosts): Instance of class SonicHosts
        conn_graph_facts (dict): Connection graph dict.

    Returns:
        dict: A dict of PDU managers. Key is device hostname. Value is the PDU manager object for the device.
    """
    pdu_managers = {}
    device_pdu_links = conn_graph_facts['device_pdu_links']
    device_pdu_info = conn_graph_facts['device_pdu_info']
    for hostname in sonichosts.hostnames:
        pdu_links = device_pdu_links[hostname]
        pdu_info = device_pdu_info[hostname]
        # Visible inventory vars of every PDU that feeds this device.
        pdu_vars = {pdu_name: sonichosts.get_host_visible_vars(pdu_name) for pdu_name in pdu_info.keys()}

        pdu_managers[hostname] = pdu_manager_factory(hostname, pdu_links, pdu_info, pdu_vars)
    return pdu_managers


def posix_shell_onie(dut_console, mgmt_ip, image_url):
    """Relay a console session and drive an image install from the ONIE prompt.

    Local stdin is switched to raw/cbreak mode for the duration of the session
    and restored on exit. Console output is echoed to stdout while being
    scanned for boot-menu and ONIE markers; keystrokes typed locally are
    forwarded to the console.

    Args:
        dut_console: Console host object; only its ``remote_conn`` is used.
        mgmt_ip (str): Management address in ``ip/prefix`` form.
        image_url (str): URL handed to ``onie-nos-install``.
    """
    oldtty = termios.tcgetattr(sys.stdin)
    # Bumped when the GRUB menu is seen and again when the ONIE entry is confirmed.
    enter_onie_flag = 0
    # First usable host of the management network is used as default gateway.
    # next(iter(...)) avoids materialising every host address of the network.
    gw_ip = next(iter(ipaddress.ip_interface(mgmt_ip).network.hosts()))
    try:
        tty.setraw(sys.stdin.fileno())
        tty.setcbreak(sys.stdin.fileno())
        dut_console.remote_conn.settimeout(0.0)

        while True:
            r, w, e = select.select([dut_console.remote_conn, sys.stdin], [], [])
            if dut_console.remote_conn in r:
                try:
                    x = dut_console.remote_conn.recv(65536)
                    if len(x) == 0:
                        sys.stdout.write("\r\n*** EOF\r\n")
                        break

                    x = x.decode('ISO-8859-9')

                    if "GNU GRUB" in x:
                        enter_onie_flag += 1
                        continue

                    if "SONiC-OS-" in x and enter_onie_flag == 1:
                        # Send arrow key "down" here.
                        dut_console.remote_conn.send(b'\x1b[B')
                        continue

                    if "*ONIE" in x and "Install OS" not in x:
                        dut_console.remote_conn.send("\n")
                        enter_onie_flag += 1

                    if "ONIE: Starting ONIE Service Discovery" in x:
                        # TODO: Define a function to send command here
                        for _ in range(5):
                            dut_console.remote_conn.send('onie-discovery-stop\n')
                            dut_console.remote_conn.send("\n")

                        dut_console.remote_conn.send("ifconfig eth0 {} netmask {}".format(
                            mgmt_ip.split('/')[0],
                            ipaddress.ip_interface(mgmt_ip).with_netmask.split('/')[1]))
                        dut_console.remote_conn.send("\n")

                        dut_console.remote_conn.send("ip route add default via {}".format(gw_ip))
                        dut_console.remote_conn.send("\n")

                        dut_console.remote_conn.send("onie-nos-install {}".format(image_url))
                        dut_console.remote_conn.send("\n")
                        # We will wait some time to connect to image server
                        time.sleep(60)
                        x = dut_console.remote_conn.recv(1024)
                        x = x.decode('ISO-8859-9')
                        # TODO: Give a sample output here
                        # "ETA" in the output is taken as the download being in progress.
                        if "ETA" in x:
                            break

                    if "sonic login:" in x:
                        dut_console.remote_conn.close()

                    sys.stdout.write(x)
                    sys.stdout.flush()
                except socket.timeout:
                    pass
            if sys.stdin in r:
                x = sys.stdin.read(1)
                if len(x) == 0:
                    break
                dut_console.remote_conn.send(x)

    finally:
        # Always put the local terminal back the way it was found.
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, oldtty)


def posix_shell_aboot(dut_console, mgmt_ip, image_url):
    """Relay a console session and drive an image install from the Aboot shell.

    Local stdin is switched to raw/cbreak mode for the duration of the session
    and restored on exit. Once the Aboot prompt is seen, the management
    interface ``ma1`` is configured, the image is fetched with ``wget``,
    ``boot-config`` is pointed at it and the device is rebooted.

    Args:
        dut_console: Console host object; only its ``remote_conn`` is used.
        mgmt_ip (str): Management address in ``ip/prefix`` form.
        image_url (str): URL of the image; its last path component is written
            to ``boot-config``.
    """
    oldtty = termios.tcgetattr(sys.stdin)
    # True until the install commands have been sent once.
    install_image_flag = True
    # First usable host of the management network is used as default gateway.
    # next(iter(...)) avoids materialising every host address of the network.
    gw_ip = next(iter(ipaddress.ip_interface(mgmt_ip).network.hosts()))
    try:
        tty.setraw(sys.stdin.fileno())
        tty.setcbreak(sys.stdin.fileno())
        dut_console.remote_conn.settimeout(0.0)

        while True:
            r, w, e = select.select([dut_console.remote_conn, sys.stdin], [], [])
            if dut_console.remote_conn in r:
                try:
                    x = dut_console.remote_conn.recv(65536)
                    if len(x) == 0:
                        sys.stdout.write("\r\n*** EOF\r\n")
                        break

                    x = x.decode('ISO-8859-9')

                    if install_image_flag:
                        # TODO: We can not exactly determine the string in buffer,
                        # TODO: in the future, maybe we will gather the buffer and then process them
                        # "Press Control-C now to enter Aboot shell"
                        if "Press" in x:
                            dut_console.remote_conn.send("\x03")
                            continue

                        if "Aboot" in x and "#" in x:
                            # TODO: Define a function to send command here
                            dut_console.remote_conn.send("ifconfig ma1 {} netmask {}".format(
                                mgmt_ip.split('/')[0],
                                ipaddress.ip_interface(mgmt_ip).with_netmask.split('/')[1]))
                            dut_console.remote_conn.send("\n")

                            time.sleep(1)

                            dut_console.remote_conn.send("route add default gw {}".format(gw_ip))
                            dut_console.remote_conn.send("\n")

                            time.sleep(1)

                            dut_console.remote_conn.send("ip route add default via {} dev ma1".format(gw_ip))
                            dut_console.remote_conn.send("\n")

                            time.sleep(1)

                            dut_console.remote_conn.send("wget {}".format(image_url))
                            dut_console.remote_conn.send("\n")

                            # Poll up to five times for "ETA" in the wget output.
                            for _ in range(5):
                                time.sleep(10)
                                x = dut_console.remote_conn.recv(1024)
                                x = x.decode('ISO-8859-9')
                                if "ETA" in x:
                                    break

                            dut_console.remote_conn.send("echo 'SWI=flash:{}' > boot-config"
                                                         .format(image_url.split("/")[-1]))
                            dut_console.remote_conn.send("\n")

                            dut_console.remote_conn.send("reboot")
                            dut_console.remote_conn.send("\n")

                            install_image_flag = False

                    if "login:" in x:
                        dut_console.remote_conn.close()

                    sys.stdout.write(x)
                    sys.stdout.flush()
                except socket.timeout:
                    pass
            if sys.stdin in r:
                x = sys.stdin.read(1)
                if len(x) == 0:
                    break
                dut_console.remote_conn.send(x)

    finally:
        # Always put the local terminal back the way it was found.
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, oldtty)


def do_power_cycle(sonichost, conn_graph_facts, localhost):
    """Power the device(s) off, pause 30 seconds, then power them back on.

    Args:
        sonichost: Passed straight to ``get_pdu_managers``, which reads its
            ``hostnames`` and ``get_host_visible_vars``.
            NOTE(review): the name suggests a single host -- confirm the
            object handed in by callers provides that interface.
        conn_graph_facts (dict): Connection graph dict.
        localhost: Object providing ``pause(seconds=..., prompt=...)``.
    """
    pdu_managers = get_pdu_managers(sonichost, conn_graph_facts)

    for hostname, pdu_manager in pdu_managers.items():
        logger.info("Turn off power outlets to {}".format(hostname))
        pdu_manager.turn_off_outlet()
    localhost.pause(seconds=30, prompt="Pause between power off/on")

    for hostname, pdu_manager in pdu_managers.items():
        logger.info("Turn on power outlets to {}".format(hostname))
        pdu_manager.turn_on_outlet()


def check_sonic_installer(sonichost, sonic_username, sonic_password, sonic_ip, image_url):
    """Log in over ssh and check that ``sonic-installer install`` starts.

    Any pexpect timeout/EOF raised while waiting for an expected prompt
    propagates to the caller; the ssh child process is closed either way.

    Args:
        sonichost: Host object; only ``hostname`` is read.
        sonic_username (str): Login user.
        sonic_password (str): Login password.
        sonic_ip (str): Address to ssh to.
        image_url (str): Image URL passed to ``sonic-installer install``.
    """
    client = pexpect.spawn('ssh {}@{} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
                           .format(sonic_username, sonic_ip))
    try:
        # Prompts are built from the user actually logged in as, rather than a
        # hard-coded "admin", so non-default users do not time out here.
        client.expect("{}@{}'s password:".format(sonic_username, sonic_ip))
        client.sendline(sonic_password)
        client.expect(["{}@sonic".format(sonic_username),
                       "{}@{}".format(sonic_username, sonichost.hostname)])
        client.sendline("sudo sonic-installer install {}"
                        .format(image_url))
        client.expect("New image will be installed")
    finally:
        # Do not leave the ssh child behind when an expect() fails.
        client.close()


# =============================================================================
# diff --git a/.azure-pipelines/recover_testbed/dut_connection.py
#            b/.azure-pipelines/recover_testbed/dut_connection.py
# new file mode 100644, index 00000000000..fd9436f6696
# =============================================================================
#!/usr/bin/env python3

import logging
import os
import sys
import paramiko
import socket
import glob
import re
import yaml
import jinja2
from paramiko.ssh_exception import AuthenticationException

_self_dir = os.path.dirname(os.path.abspath(__file__))
base_path = os.path.realpath(os.path.join(_self_dir, "../.."))
if base_path not in sys.path:
    sys.path.append(base_path)
ansible_path = os.path.realpath(os.path.join(_self_dir, "../../ansible"))
if ansible_path not in sys.path:
    sys.path.append(ansible_path)

# Imported only after base_path is on sys.path, so the `tests` package resolves
# even when this module is the first one of the tool to be imported.
from tests.common.connections.console_host import ConsoleHost    # noqa E402

logger = logging.getLogger(__name__)

RC_SSH_SUCCESS = 0
RC_SOCKET_TIMEOUT = 1
RC_PASSWORD_FAILED = 2


def creds_on_dut(sonichost):
    """Collect credential variables for a DUT from the ansible var files.

    Var files are globbed relative to the current working directory
    (``group_vars/all``, ``vars`` and ``group_vars/<group>`` for each of the
    host's groups plus ``fanout``), merged in that order, and a fixed set of
    credential variables is rendered through jinja2 with the host vars.

    Args:
        sonichost: Host object exposing ``hostname``, ``im`` and ``vm``.

    Returns:
        dict: Merged variables, plus ``console_user`` and ``console_password``
        dicts keyed like the host's ``console_login`` variable.
    """
    # Copy so that appending "fanout" never touches the inventory's own list.
    groups = list(sonichost.im.get_host(sonichost.hostname).get_vars()['group_names'])
    groups.append("fanout")
    logger.info("dut {} belongs to groups {}".format(sonichost.hostname, groups))
    exclude_regex_patterns = [
        r'topo_.*\.yml',
        r'breakout_speed\.yml',
        r'lag_fanout_ports_test_vars\.yml',
        r'qos\.yml',
        r'sku-sensors-data\.yml',
        r'mux_simulator_http_port_map\.yml'
    ]
    # Compile the combined pattern once instead of re-joining it per file.
    exclude_regex = re.compile('|'.join(exclude_regex_patterns))
    files = glob.glob("group_vars/all/*.yml")
    files += glob.glob("vars/*.yml")
    for group in groups:
        files += glob.glob("group_vars/{}/*.yml".format(group))
    filtered_files = [f for f in files if not exclude_regex.search(f)]

    creds = {}
    for f in filtered_files:
        with open(f) as stream:
            v = yaml.safe_load(stream)
            if v is not None:
                creds.update(v)
            else:
                logger.info("skip empty var file {}".format(f))

    cred_vars = [
        "sonicadmin_user",
        "sonicadmin_password",
        "docker_registry_host",
        "docker_registry_username",
        "docker_registry_password",
        "public_docker_registry_host"
    ]

    hostvars = sonichost.vm.get_vars(
        host=sonichost.im.get_hosts(pattern='sonic')[0])

    for cred_var in cred_vars:
        if cred_var in creds:
            creds[cred_var] = jinja2.Template(creds[cred_var]).render(**hostvars)

    console_login_creds = hostvars["console_login"] if "console_login" in hostvars else {}

    creds["console_user"] = {}
    creds["console_password"] = {}
    for k, v in console_login_creds.items():
        creds["console_user"][k] = v["user"]
        creds["console_password"][k] = v["passwd"]
    return creds


def get_console_info(sonichost, conn_graph_facts):
    """Look up the console server details of a DUT in the connection graph.

    Returns:
        tuple: ``(console_host, console_port, console_type, console_username)``
        read from ``device_console_info`` / ``device_console_link``; the
        username is the link's ``proxy`` field.
    """
    console_port_link = conn_graph_facts['device_console_link'][sonichost.hostname]['ConsolePort']
    console_host = conn_graph_facts['device_console_info'][sonichost.hostname]['ManagementIp']
    console_port = console_port_link['peerport']
    console_type = console_port_link['type']
    console_username = console_port_link['proxy']

    return console_host, console_port, console_type, console_username


def get_ssh_info(sonichost):
    """Return the ssh login details of a DUT.

    Returns:
        tuple: ``(sonic_username, sonic_password, sonic_ip)`` where
        ``sonic_password`` is a two-element list: the primary password and the
        ``ansible_altpassword`` host var (``None`` when that var is not set).
    """
    creds = creds_on_dut(sonichost)
    sonic_username = creds['sonicadmin_user']
    sonicadmin_alt_password = sonichost.vm.get_vars(
        host=sonichost.im.get_hosts(pattern='sonic')[0]).get("ansible_altpassword")
    sonic_password = [creds['sonicadmin_password'], sonicadmin_alt_password]
    sonic_ip = sonichost.im.get_host(sonichost.hostname).vars['ansible_host']
    # Passwords are deliberately kept out of the log.
    logger.info("sonic username: {}, ip: {}".format(sonic_username, sonic_ip))
    return sonic_username, sonic_password, sonic_ip


def duthost_console(sonichost, conn_graph_facts, localhost):
    """Build a ConsoleHost for the DUT from connection graph and credentials.

    Args:
        sonichost: Host object exposing ``hostname``, ``im`` and ``vm``.
        conn_graph_facts (dict): Connection graph dict.
        localhost: Unused here; kept for callers.

    Returns:
        ConsoleHost: Console connection object for the DUT.
    """
    console_host, console_port, console_type, console_username = get_console_info(sonichost, conn_graph_facts)
    console_type = "console_" + console_type

    # console password and sonic_password are lists, which may contain more than one password
    sonicadmin_alt_password = sonichost.vm.get_vars(
        host=sonichost.im.get_hosts(pattern='sonic')[0]).get("ansible_altpassword")
    creds = creds_on_dut(sonichost)

    host = ConsoleHost(console_type=console_type,
                       console_host=console_host,
                       console_port=console_port,
                       sonic_username=creds['sonicadmin_user'],
                       sonic_password=[creds['sonicadmin_password'], sonicadmin_alt_password],
                       console_username=console_username,
                       console_password=creds['console_password'][console_type])

    return host


def duthost_ssh(sonichost):
    """Probe ssh access to the DUT with each known password.

    Returns:
        tuple | int: ``(sonic_username, password, sonic_ip)`` for the first
        password that authenticates; ``RC_SOCKET_TIMEOUT`` when the connection
        times out; ``RC_PASSWORD_FAILED`` when no password is accepted.
    """
    sonic_username, sonic_passwords, sonic_ip = get_ssh_info(sonichost)
    for password in sonic_passwords:
        if password is None:
            # No alternate password configured. With agent and key lookup
            # disabled, paramiko raises a plain SSHException for password=None,
            # which the handlers below would not catch.
            continue
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(sonic_ip, username=sonic_username, password=password,
                        allow_agent=False, look_for_keys=False, timeout=10)
            return sonic_username, password, sonic_ip
        except AuthenticationException:
            continue
        except socket.timeout as e:
            logger.info("Cannot access DUT {} via ssh, error: {}".format(sonichost.hostname, e))
            return RC_SOCKET_TIMEOUT
        finally:
            # Release the client on every path, not only after a good login.
            ssh.close()
    return RC_PASSWORD_FAILED


# =============================================================================
# diff --git a/.azure-pipelines/recover_testbed/recover_testbed.py
#            b/.azure-pipelines/recover_testbed/recover_testbed.py
# new file mode 100644, index 00000000000..2705f4eddbe
# =============================================================================
#!/usr/bin/env python3

import argparse
import logging
import os
import sys
from common import do_power_cycle, check_sonic_installer, posix_shell_aboot, posix_shell_onie

_self_dir = os.path.dirname(os.path.abspath(__file__))
base_path = os.path.realpath(os.path.join(_self_dir, "../.."))
if base_path not in sys.path:
    sys.path.append(base_path)
ansible_path = os.path.realpath(os.path.join(_self_dir, "../../ansible"))
if ansible_path not in sys.path:
    sys.path.append(ansible_path)

from devutil.devices.factory import init_localhost, init_testbed_sonichosts  # noqa E402
from dut_connection import duthost_ssh, duthost_console, get_ssh_info, RC_SOCKET_TIMEOUT  # noqa E402
from testbed_status import dut_lose_management_ip   # noqa F401

logger = logging.getLogger(__name__)

RC_INIT_FAILED = 1

"""
This script must be run under folder "sonic-mgmt/ansible"
SSH connection success: check if sonic-installer is usable, if not, do power cycle.

If console fails, do power cycle
"""


def recover_via_console(sonichost, conn_graph_facts, localhost, mgmt_ip, image_url, hwsku):
    """Power-cycle the DUT and reinstall its image through the console.

    The boot-loader flow is chosen from the first dash-separated token of
    ``hwsku``; unknown vendors are left untouched after the power cycle.
    Failures are logged and swallowed so that the caller can carry on.

    Args:
        sonichost: Host object of the DUT.
        conn_graph_facts (dict): Connection graph dict.
        localhost: Localhost object passed through to the helpers.
        mgmt_ip (str): Management address in ``ip/prefix`` form.
        image_url (str): URL of the image to install.
        hwsku (str): Hardware SKU, e.g. starting with ``Arista-``.
    """
    try:
        dut_console = duthost_console(sonichost, conn_graph_facts, localhost)

        do_power_cycle(sonichost, conn_graph_facts, localhost)

        # Named `vendor` so the builtin `type` is not shadowed.
        vendor = hwsku.split('-')[0].lower()

        if vendor in ["arista"]:
            posix_shell_aboot(dut_console, mgmt_ip, image_url)
        # elif vendor in ["Cisco"]:
        #    return
        elif vendor in ["mellanox", "nexus", "acs"]:
            posix_shell_onie(dut_console, mgmt_ip, image_url)
        else:
            return

        dut_lose_management_ip(sonichost, conn_graph_facts, localhost, mgmt_ip)
    except Exception:
        # Best effort: keep going, but record the traceback.
        logger.exception("Failed to recover {} via console".format(sonichost.hostname))
        return


def recover_testbed(sonichosts, conn_graph_facts, localhost, image_url, hwsku):
    """Check every DUT over ssh and recover through the console when needed.

    Each DUT gets up to three ssh attempts. A DUT is recovered only if the
    last attempt made still failed (ssh timeout or ``sonic-installer`` not
    usable). An authentication failure ends the whole run immediately.

    Args:
        sonichosts: Iterable of DUT host objects.
        conn_graph_facts (dict): Connection graph dict.
        localhost: Localhost object passed through to the helpers.
        image_url (str): URL of the image to install.
        hwsku (str): Hardware SKU of the DUTs.
    """
    for sonichost in sonichosts:
        need_to_recover = False
        for _ in range(3):
            dut_ssh = duthost_ssh(sonichost)

            if isinstance(dut_ssh, tuple):
                logger.info("SSH success.")
                sonic_username, sonic_password, sonic_ip = dut_ssh

                try:
                    check_sonic_installer(sonichost, sonic_username, sonic_password, sonic_ip, image_url)
                # TODO: specify which Exception it is
                except Exception as e:
                    logger.info("Exception caught while executing cmd. Error message: {}".format(e))
                    need_to_recover = True
                else:
                    # A success clears any failure seen on an earlier attempt,
                    # otherwise a transient glitch would still force recovery.
                    need_to_recover = False
                    break
            elif dut_ssh == RC_SOCKET_TIMEOUT:
                # Do power cycle
                need_to_recover = True
            else:
                logger.info("Authentication failed. Passwords are incorrect.")
                return

        # Get dut ip with network mask
        mgmt_ip = conn_graph_facts["device_info"][sonichost.hostname]["ManagementIp"]
        if need_to_recover:
            recover_via_console(sonichost, conn_graph_facts, localhost, mgmt_ip, image_url, hwsku)


def validate_args(args):
    """Configure logging to stdout at the level named by ``args.log_level``."""
    _log_level_map = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL
    }
    logging.basicConfig(
        stream=sys.stdout,
        level=_log_level_map[args.log_level],
        format="%(asctime)s %(filename)s#%(lineno)d %(levelname)s - %(message)s"
    )


def main(args):
    """Initialise hosts from the inventory/testbed and run the recovery.

    Exits with ``RC_INIT_FAILED`` when localhost or the testbed's sonic hosts
    cannot be initialised.
    """
    logger.info("Validating arguments")
    validate_args(args)

    logger.info("Initializing hosts")
    localhost = init_localhost(args.inventory, options={"verbosity": args.verbosity})
    sonichosts = init_testbed_sonichosts(
        args.inventory, args.testbed_name, testbed_file=args.tbfile, options={"verbosity": args.verbosity}
    )

    if not localhost or not sonichosts:
        sys.exit(RC_INIT_FAILED)

    conn_graph_facts = localhost.conn_graph_facts(
        hosts=sonichosts.hostnames,
        filepath=os.path.join(ansible_path, "files")
    )["ansible_facts"]

    recover_testbed(sonichosts, conn_graph_facts, localhost, args.image, args.hwsku)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Tool for recovering the SONiC DUTs of a testbed.")

    parser.add_argument(
        "-i", "--inventory",
        dest="inventory",
        nargs="+",
        help="Ansible inventory file")

    parser.add_argument(
        "-t", "--testbed-name",
        type=str,
        required=True,
        dest="testbed_name",
        help="Testbed name."
    )

    parser.add_argument(
        "--tbfile",
        type=str,
        dest="tbfile",
        default="testbed.yaml",
        help="Testbed definition file."
    )

    parser.add_argument(
        "-v", "--verbosity",
        type=int,
        dest="verbosity",
        default=2,
        help="Log verbosity (0-3)."
    )

    parser.add_argument(
        "--log-level",
        type=str,
        dest="log_level",
        choices=["debug", "info", "warning", "error", "critical"],
        default="debug",
        help="Loglevel"
    )

    # Accepted for command-line compatibility; nothing in this script reads it.
    parser.add_argument(
        "-o", "--output",
        type=str,
        dest="output",
        required=False,
        help="Output duts version to the specified file."
    )

    parser.add_argument(
        "--image",
        type=str,
        dest="image",
        required=True,
        help="The image url"
    )

    parser.add_argument(
        "--hwsku",
        type=str,
        dest="hwsku",
        required=True,
        help="Hwsku of DUT"
    )

    args = parser.parse_args()
    main(args)


# =============================================================================
# diff --git a/.azure-pipelines/recover_testbed/testbed_status.py
#            b/.azure-pipelines/recover_testbed/testbed_status.py
# new file mode 100644, index 00000000000..acedc315495
# =============================================================================
import logging
import ipaddress
from dut_connection import duthost_ssh, duthost_console    # noqa E402

logger = logging.getLogger(__name__)


def dut_lose_management_ip(sonichost, conn_graph_facts, localhost, mgmt_ip):
    """Re-add the management address and default route through the console.

    The address is added to ``eth0`` with the network's broadcast address and
    the first usable host of the network is used as gateway. Command failures
    are logged and swallowed; the console is disconnected in every case.

    Args:
        sonichost: Host object of the DUT.
        conn_graph_facts (dict): Connection graph dict.
        localhost: Localhost object passed through to ``duthost_console``.
        mgmt_ip (str): Management address in ``ip/prefix`` form.
    """
    # Recover DUTs
    logger.info("=====Recover start=====")
    dut_console = duthost_console(sonichost, conn_graph_facts, localhost)
    mgmt_network = ipaddress.ip_interface(mgmt_ip).network
    # next(iter(...)) avoids materialising every host address of the network.
    gw_ip = next(iter(mgmt_network.hosts()))
    brd_ip = mgmt_network.broadcast_address
    try:
        # TODO: Add mgmt ip into file /etc/network/interfaces -- ip, mask, gw
        dut_console.send_command("sudo ip addr add {} brd {} dev eth0".format(mgmt_ip, brd_ip))
        dut_console.send_command("sudo ip route add default via {}".format(gw_ip))
    except Exception as e:
        logger.info(e)
    finally:
        logger.info("=====Recover finish=====")
        dut_console.disconnect()