diff --git a/ansible/roles/test/files/ptftests/advanced-reboot.py b/ansible/roles/test/files/ptftests/advanced-reboot.py index 9e05c0cf09c..71943b390d0 100644 --- a/ansible/roles/test/files/ptftests/advanced-reboot.py +++ b/ansible/roles/test/files/ptftests/advanced-reboot.py @@ -144,6 +144,7 @@ def __init__(self): self.check_param('dut_stabilize_secs', 30, required=False) self.check_param('preboot_files', None, required = False) self.check_param('preboot_oper', None, required = False) + self.check_param('allow_vlan_flooding', False, required = False) if not self.test_params['preboot_oper'] or self.test_params['preboot_oper'] == 'None': self.test_params['preboot_oper'] = None @@ -191,6 +192,8 @@ def __init__(self): # second is the fast send_in_background self.dataplane_io_lock = threading.Lock() + self.allow_vlan_flooding = bool(self.test_params['allow_vlan_flooding']) + return def read_json(self, name): @@ -1139,6 +1142,8 @@ def wait_dut_to_warm_up(self): if not self.asic_state.is_flooding() and elapsed > dut_stabilize_secs: break if elapsed > warm_up_timeout_secs: + if self.allow_vlan_flooding: + break raise Exception("Data plane didn't stop flooding within warm up timeout") time.sleep(1) diff --git a/ansible/roles/test/files/reboot/fast-reboot b/ansible/roles/test/files/reboot/fast-reboot new file mode 100755 index 00000000000..d2383f35dac --- /dev/null +++ b/ansible/roles/test/files/reboot/fast-reboot @@ -0,0 +1,484 @@ +#!/bin/bash -e + +REBOOT_USER=$(logname) +REBOOT_TIME=$(date) +REBOOT_CAUSE_FILE="/host/reboot-cause/reboot-cause.txt" +WARM_DIR=/host/warmboot +REDIS_FILE=dump.rdb +REBOOT_SCRIPT_NAME=$(basename $0) +REBOOT_TYPE="${REBOOT_SCRIPT_NAME}" +VERBOSE=no +FORCE=no +REBOOT_METHOD="/sbin/kexec -e" +ASSISTANT_IP_LIST="" +ASSISTANT_SCRIPT="/usr/bin/neighbor_advertiser" + +# Require 100M available on the hard drive for warm reboot temp files, +# Size is in 1K blocks: +MIN_HD_SPACE_NEEDED=100000 + +EXIT_SUCCESS=0 +EXIT_FAILURE=1 +EXIT_NOT_SUPPORTED=2 +EXIT_FILE_SYSTEM_FULL=3 +EXIT_NEXT_IMAGE_NOT_EXISTS=4 +EXIT_ORCHAGENT_SHUTDOWN=10 +EXIT_SYNCD_SHUTDOWN=11 +EXIT_FAST_REBOOT_DUMP_FAILURE=12 + +function error() +{ + echo $@ >&2 +} + +function debug() +{ + if [[ x"${VERBOSE}" == x"yes" ]]; then + echo `date` $@ + fi + logger "$@" +} + +function showHelpAndExit() +{ + echo "Usage: ${REBOOT_SCRIPT_NAME} [options]" + echo " -h,-? : get this help" + echo " -v : turn on verbose" + echo " -f : force execution" + echo " -r : reboot with /sbin/reboot" + echo " -k : reboot with /sbin/kexec -e [default]" + echo " -x : execute script with -x flag" + echo " -c : specify control plane assistant IP list" + + exit "${EXIT_SUCCESS}" +} + +function parseOptions() +{ + while getopts "vfh?rkxc:" opt; do + case ${opt} in + h|\? ) + showHelpAndExit + ;; + v ) + VERBOSE=yes + ;; + f ) + FORCE=yes + ;; + r ) + REBOOT_METHOD="/sbin/reboot" + ;; + k ) + REBOOT_METHOD="/sbin/kexec -e" + ;; + x ) + set -x + ;; + c ) + ASSISTANT_IP_LIST=${OPTARG} + ;; + esac + done +} + +function clear_fast_boot() +{ + debug "${REBOOT_TYPE} failure ($?) cleanup ..." + + /sbin/kexec -u || /bin/true + + teardown_control_plane_assistant +} + +function clear_warm_boot() +{ + clear_fast_boot + + result=`timeout 10s config warm_restart disable; if [[ $? == 124 ]]; then echo timeout; else echo "code ($?)"; fi` || /bin/true + debug "Cancel warm-reboot: ${result}" + + TIMESTAMP=`date +%Y%m%d-%H%M%S` + if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then + mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true + fi +} + +function init_warm_reboot_states() +{ + # If the current running instanace was booted up with warm reboot. Then + # the current DB contents will likely mark warm reboot is done. + # Clear these states so that the next boot up image won't get confused. + if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + redis-cli -n 6 eval " + for _, key in ipairs(redis.call('keys', 'WARM_RESTART_TABLE|*')) do + redis.call('hdel', key, 'state') + end + " 0 >/dev/null + fi +} + +function initialize_pre_shutdown() +{ + debug "Initialize pre-shutdown ..." + TABLE="WARM_RESTART_TABLE|warm-shutdown" + RESTORE_COUNT=`/usr/bin/redis-cli -n 6 hget "${TABLE}" restore_count` + if [[ -z "$RESTORE_COUNT" ]]; then + /usr/bin/redis-cli -n 6 hset "${TABLE}" "restore_count" "0" > /dev/null + fi + /usr/bin/redis-cli -n 6 hset "${TABLE}" "state" "requesting" > /dev/null +} + +function request_pre_shutdown() +{ + debug "Requesting pre-shutdown ..." + /usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || { + error "Failed to request pre-shutdown" + exit "${EXIT_SYNCD_SHUTDOWN}" + } +} + +function wait_for_pre_shutdown_complete_or_fail() +{ + debug "Waiting for pre-shutdown ..." + TABLE="WARM_RESTART_TABLE|warm-shutdown" + STATE="requesting" + declare -i waitcount + declare -i retrycount + waitcount=0 + retrycount=0 + # Wait up to 60 seconds for pre-shutdown to complete + while [[ ${waitcount} -lt 600 ]]; do + # timeout doesn't work with -i option of "docker exec". Therefore we have + # to invoke docker exec directly below. + STATE=`timeout 5s docker exec database redis-cli -n 6 hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi` + + if [[ x"${STATE}" == x"timed out" ]]; then + waitcount+=50 + retrycount+=1 + debug "Timed out getting pre-shutdown state (${waitcount}) retry count ${retrycount} ..." + if [[ retrycount -gt 2 ]]; then + break + fi + elif [[ x"${STATE}" != x"requesting" ]]; then + break + else + sleep 0.1 + waitcount+=1 + fi + done + + if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then + debug "Syncd pre-shutdown failed: ${STATE} ..." + exit "${EXIT_SYNCD_SHUTDOWN}" + fi + debug "Pre-shutdown succeeded ..." +} + +function backup_database() +{ + debug "Backing up database ..." + # Dump redis content to a file 'dump.rdb' in warmboot directory + mkdir -p $WARM_DIR + # Delete keys in stateDB except FDB_TABLE|* and WARM_RESTA* + redis-cli -n 6 eval " + for _, k in ipairs(redis.call('keys', '*')) do + if not string.match(k, 'FDB_TABLE|') and not string.match(k, 'WARM_RESTART_TABLE|') \ + and not string.match(k, 'WARM_RESTART_ENABLE_TABLE|') then + redis.call('del', k) + end + end + " 0 > /dev/null + redis-cli save > /dev/null + docker cp database:/var/lib/redis/$REDIS_FILE $WARM_DIR + docker exec -i database rm /var/lib/redis/$REDIS_FILE +} + +function setup_control_plane_assistant() +{ + if [[ -n "${ASSISTANT_IP_LIST}" && -x ${ASSISTANT_SCRIPT} ]]; then + debug "Setting up control plane assistant: ${ASSISTANT_IP_LIST} ..." + ${ASSISTANT_SCRIPT} -s ${ASSISTANT_IP_LIST} -m set + fi +} + +function teardown_control_plane_assistant() +{ + if [[ -n "${ASSISTANT_IP_LIST}" && -x ${ASSISTANT_SCRIPT} ]]; then + debug "Tearing down control plane assistant: ${ASSISTANT_IP_LIST} ..." + ${ASSISTANT_SCRIPT} -s ${ASSISTANT_IP_LIST} -m reset + fi +} + +function setup_reboot_variables() +{ + # Kernel and initrd image + CURRENT_SONIC_IMAGE=$(sonic_installer list | grep "Current: " | cut -d ' ' -f 2) + NEXT_SONIC_IMAGE=$(sonic_installer list | grep "Next: " | cut -d ' ' -f 2) + IMAGE_PATH="/host/image-${NEXT_SONIC_IMAGE#SONiC-OS-}" + if grep -q aboot_platform= /host/machine.conf; then + KERNEL_IMAGE="$(ls $IMAGE_PATH/boot/vmlinuz-*)" + BOOT_OPTIONS="$(cat "$IMAGE_PATH/kernel-cmdline" | tr '\n' ' ') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" + elif grep -q onie_platform= /host/machine.conf; then + KERNEL_OPTIONS=$(cat /host/grub/grub.cfg | sed "/$NEXT_SONIC_IMAGE'/,/}/"'!'"g" | grep linux) + KERNEL_IMAGE="/host$(echo $KERNEL_OPTIONS | cut -d ' ' -f 2)" + BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" + else + error "Unknown bootloader. ${REBOOT_TYPE} is not supported." + exit "${EXIT_NOT_SUPPORTED}" + fi + INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g') +} + +function reboot_pre_check() +{ + # Make sure that the file system is normal: read-write able + filename="/host/test-`date +%Y%m%d-%H%M%S`" + if [[ ! -f ${filename} ]]; then + touch ${filename} + fi + rm ${filename} + + # Make sure /host has enough space for warm reboot temp files + avail=$(df -k /host | tail -1 | awk '{ print $4 }') + if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then + debug "/host has ${avail}K bytes available, not enough for warm reboot." + exit ${EXIT_FILE_SYSTEM_FULL} + fi + + # Make sure that the next image exists + if [[ ! -d ${IMAGE_PATH} ]]; then + debug "Next image ${NEXT_SONIC_IMAGE} doesn't exist ..." + exit ${EXIT_NEXT_IMAGE_NOT_EXISTS} + fi +} + +function unload_kernel() +{ + # Unload the previously loaded kernel if any loaded + if [[ "$(cat /sys/kernel/kexec_loaded)" -eq 1 ]]; then + /sbin/kexec -u + fi +} + +# main starts here +parseOptions $@ + +# Check root privileges +if [[ "$EUID" -ne 0 ]] +then + echo "This command must be run as root" >&2 + exit "${EXIT_FAILURE}" +fi + +sonic_asic_type=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type) + +# Check reboot type supported +BOOT_TYPE_ARG="cold" +case "$REBOOT_TYPE" in + "fast-reboot") + BOOT_TYPE_ARG=$REBOOT_TYPE + trap clear_fast_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM + ;; + "warm-reboot") + if [[ "$sonic_asic_type" == "mellanox" ]]; then + REBOOT_TYPE="fastfast-reboot" + BOOT_TYPE_ARG="fastfast" + # source mlnx-ffb.sh file with + # functions to check ISSU upgrade possibility + source mlnx-ffb.sh + else + BOOT_TYPE_ARG="warm" + fi + trap clear_warm_boot EXIT HUP INT QUIT TERM KILL ABRT ALRM + config warm_restart enable system + ;; + *) + error "Not supported reboot type: $REBOOT_TYPE" + exit "${EXIT_NOT_SUPPORTED}" + ;; +esac + +# Stopping all SLB neighbors if they're presented +PASSIVE_BGP_NEIGHBORS=$(sonic-cfggen -m /etc/sonic/minigraph.xml -v "BGP_PEER_RANGE | list") +case "$PASSIVE_BGP_NEIGHBORS" in + *BGPSLBPassive*) + ASN=$(sonic-cfggen -m /etc/sonic/minigraph.xml -v "DEVICE_METADATA['localhost']['bgp_asn']") + SLBNAME=$(sonic-cfggen -m /etc/sonic/minigraph.xml -v "BGP_PEER_RANGE['BGPSLBPassive']['name']") + vtysh -e "configure terminal" -e "router bgp ${ASN}" -e "neighbor ${SLBNAME} shutdown" + sleep 1 + ;; + *) + ;; +esac + +unload_kernel + +setup_reboot_variables + +reboot_pre_check + +# Install new FW for mellanox platforms before control plane goes down +# So on boot switch will not spend time to upgrade FW increasing the CP downtime +if [[ "$sonic_asic_type" == "mellanox" ]]; then + MLNX_EXIT_SUCCESS=0 + MLNX_EXIT_FW_ERROR=100 + MLNX_EXIT_FFB_FAILURE=101 + + MLNX_FW_UPGRADE_SCRIPT="/usr/bin/mlnx-fw-upgrade.sh" + + + if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + check_ffb || { + error "Warm reboot is not supported" + exit "${MLNX_EXIT_FFB_FAILURE}" + } + fi + + debug "Prepare MLNX ASIC to ${REBOOT_TYPE}: install new FW if required" + + ${MLNX_FW_UPGRADE_SCRIPT} --upgrade + MLNX_EXIT_CODE="$?" + if [[ "${MLNX_EXIT_CODE}" != "${MLNX_EXIT_SUCCESS}" ]]; then + error "Failed to burn MLNX FW: errno=${MLNX_EXIT_CODE}" + exit "${MLNX_EXIT_FW_ERROR}" + fi +fi + +# Load kernel into the memory +/sbin/kexec -l "$KERNEL_IMAGE" --initrd="$INITRD" --append="$BOOT_OPTIONS" + +if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then + # Dump the ARP and FDB tables to files also as default routes for both IPv4 and IPv6 + # into /host/fast-reboot + mkdir -p /host/fast-reboot + FAST_REBOOT_DUMP_RC=0 + /usr/bin/fast-reboot-dump.py -t /host/fast-reboot || FAST_REBOOT_DUMP_RC=$? + if [[ FAST_REBOOT_DUMP_RC -ne 0 ]]; then + error "Failed to run fast-reboot-dump.py. Exit code: $FAST_REBOOT_DUMP_RC" + unload_kernel + exit "${EXIT_FAST_REBOOT_DUMP_FAILURE}" + fi +fi + +init_warm_reboot_states + +setup_control_plane_assistant + +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + # Freeze orchagent for warm restart + # Ask orchagent_restart_check to try freeze 5 times with interval of 2 seconds, + # it is possible that the orchagent is in transient state and no opportunity to be freezed + # Note: assume that 2*5 seconds is enough for orchagent to process the request and respone freeze or not + debug "Pausing orchagent ..." + docker exec -i swss /usr/bin/orchagent_restart_check -w 2000 -r 5 > /dev/null || RESTARTCHECK_RC=$? + if [[ RESTARTCHECK_RC -ne 0 ]]; then + error "RESTARTCHECK failed" + if [[ x"${FORCE}" == x"yes" ]]; then + debug "Ignoring orchagent pausing failure ..." + else + exit "${EXIT_ORCHAGENT_SHUTDOWN}" + fi + fi +fi + +# Kill bgpd to start the bgp graceful restart procedure +debug "Stopping bgp ..." +docker exec -i bgp pkill -9 zebra +docker exec -i bgp pkill -9 bgpd || [ $? == 1 ] +debug "Stopped bgp ..." + +# Kill lldp, otherwise it sends informotion about reboot +docker kill lldp > /dev/null + +if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then + # Kill teamd, otherwise it gets down all LAGs + # Note: teamd must be killed before syncd, because it will send the last packet through CPU port + # TODO: stop teamd gracefully to allow teamd to send last valid update to be sure we'll have 90 seconds reboot time + docker kill teamd > /dev/null +fi + +# Kill swss dockers +docker kill swss > /dev/null + +# Pre-shutdown syncd +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + initialize_pre_shutdown + + request_pre_shutdown + + wait_for_pre_shutdown_complete_or_fail + + # Warm reboot: dump state to host disk + if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + redis-cli -n 1 FLUSHDB > /dev/null + redis-cli -n 2 FLUSHDB > /dev/null + redis-cli -n 5 FLUSHDB > /dev/null + fi + + # TODO: backup_database preserves FDB_TABLE + # need to cleanup as well for fastfast boot case + backup_database +fi + +# Stop teamd gracefully +if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then + debug "Stopping teamd ..." + # Send USR1 signal to all teamd instances to stop them + # It will prepare teamd for warm-reboot + # Note: We must send USR1 signal before syncd, because it will send the last packet through CPU port + docker exec -i teamd pkill -USR1 teamd || [ $? == 1 ] > /dev/null + debug "Stopped teamd ..." +fi + +if [[ "$sonic_asic_type" = 'broadcom' ]]; then + debug "Stopping syncd on ${CURRENT_SONIC_IMAGE} ..." + if [[ ${CURRENT_SONIC_IMAGE} =~ "20180330" ]]; then + # Gracefully stop syncd + docker exec -i syncd /usr/bin/syncd_request_shutdown --cold > /dev/null + + # Check that syncd was stopped + while docker top syncd | grep -q /usr/bin/syncd + do + sleep 0.1 + done + else + systemctl stop syncd + fi + debug "Stopped syncd ..." +fi + +# Kill other containers to make the reboot faster +docker ps -q | xargs docker kill > /dev/null + +# Stop the docker container engine. Otherwise we will have a broken docker storage +systemctl stop docker.service + +# Stop kernel modules for Nephos platform +if [[ "$sonic_asic_type" = 'nephos' ]]; +then + systemctl stop nps-modules-`uname -r`.service +fi + +# Update the reboot cause file to reflect that user issued this script +# Upon next boot, the contents of this file will be used to determine the +# cause of the previous reboot +echo "User issued '${REBOOT_SCRIPT_NAME}' command [User: ${REBOOT_USER}, Time: ${REBOOT_TIME}]" > ${REBOOT_CAUSE_FILE} + +# Wait until all buffers synced with disk +sync +sleep 1 +sync + +# sync the current system time to CMOS +if [ -x /sbin/hwclock ]; then + /sbin/hwclock -w || /bin/true +fi + +# Reboot: explicity call Linux native reboot under sbin +debug "Rebooting with ${REBOOT_METHOD} to ${NEXT_SONIC_IMAGE} ..." +exec ${REBOOT_METHOD} + +# Should never reach here +error "${REBOOT_TYPE} failed!" +exit "${EXIT_FAILURE}" diff --git a/ansible/roles/test/tasks/advanced-reboot.yml b/ansible/roles/test/tasks/advanced-reboot.yml index 07974e7c86e..84585f9c1c1 100644 --- a/ansible/roles/test/tasks/advanced-reboot.yml +++ b/ansible/roles/test/tasks/advanced-reboot.yml @@ -138,8 +138,17 @@ when: new_sonic_image is defined - set_fact: - stay_in_target_image: "{{ stay_in_target_image | default('false') | bool }}" - cleanup_old_sonic_images: "{{ cleanup_old_sonic_images | default('false') | bool }}" + stay_in_target_image: "{{ stay_in_target_image | default('false') | bool }}" + cleanup_old_sonic_images: "{{ cleanup_old_sonic_images | default('false') | bool }}" + replace_fast_reboot_script: "{{ replace_fast_reboot_script | default('false') | bool }}" + allow_vlan_flooding: "{{ allow_vlan_flooding | default('false') | bool }}" + + - name: msg="Replace fast-reboot script on the DUT" + copy: + src: roles/test/files/reboot/fast-reboot + dest: /usr/bin/fast-reboot + become: yes + when: new_sonic_image is defined and replace_fast_reboot_script - include: advanced_reboot/reboot-image-handle.yml when: new_sonic_image is defined diff --git a/ansible/roles/test/tasks/ptf_runner_reboot.yml b/ansible/roles/test/tasks/ptf_runner_reboot.yml index cfe55d66424..09ac5a61c94 100644 --- a/ansible/roles/test/tasks/ptf_runner_reboot.yml +++ b/ansible/roles/test/tasks/ptf_runner_reboot.yml @@ -24,6 +24,7 @@ - arista_vms=\"['{{ vm_hosts | list | join("','") }}']\" - preboot_files='{{ preboot_files }}' - preboot_oper='{{ item }}' + - allow_vlan_flooding='{{ allow_vlan_flooding }}' always: