-
Notifications
You must be signed in to change notification settings - Fork 1k
Add new test case for container based warm restart #6054
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2de80cf
2b2c0f3
4c8f3b0
6b8f648
99a87c3
b96b2fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -253,6 +253,17 @@ def __init__(self): | |
| self.kvm_test = True | ||
| else: | ||
| self.kvm_test = False | ||
| if "service-warm-restart" in self.test_params['reboot_type']: | ||
| self.check_param('service_list', None, required=True) | ||
| self.check_param('service_data', None, required=True) | ||
| self.service_data = self.test_params['service_data'] | ||
| for service_name in self.test_params['service_list']: | ||
| cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name) | ||
| stdout, _, _ = self.dut_connection.execCommand(cmd) | ||
| if service_name not in self.service_data: | ||
| self.service_data[service_name] = {} | ||
| self.service_data[service_name]['service_start_time'] = str(stdout[0]).strip() | ||
| self.log("Service start time for {} is {}".format(service_name, self.service_data[service_name]['service_start_time'])) | ||
| return | ||
|
|
||
| def read_json(self, name): | ||
|
|
@@ -437,7 +448,7 @@ def build_vlan_if_port_mapping(self): | |
| portchannel_names = [pc['name'] for pc in portchannel_content.values()] | ||
|
|
||
| vlan_content = self.read_json('vlan_ports_file') | ||
|
|
||
| vlan_if_port = [] | ||
| for vlan in self.vlan_ip_range: | ||
| for ifname in vlan_content[vlan]['members']: | ||
|
|
@@ -926,6 +937,31 @@ def wait_until_control_plane_up(self): | |
| self.no_control_stop = datetime.datetime.now() | ||
| self.log("Dut reboots: control plane up at %s" % str(self.no_control_stop)) | ||
|
|
||
| def wait_until_service_restart(self): | ||
| self.log("Wait until sevice restart") | ||
| self.reboot_start = datetime.datetime.now() | ||
| service_set = set(self.test_params['service_list']) | ||
| wait_time = 120 | ||
| while wait_time > 0: | ||
| for service_name in self.test_params['service_list']: | ||
| if service_name not in service_set: | ||
| continue | ||
| cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name) | ||
| stdout, _, _ = self.dut_connection.execCommand(cmd) | ||
| if self.service_data[service_name]['service_start_time'] != str(stdout[0]).strip(): | ||
| service_set.remove(service_name) | ||
| if not service_set: | ||
| break | ||
| wait_time -= 10 | ||
| time.sleep(10) | ||
|
|
||
| if service_set: | ||
| self.fails['dut'].add("Container {} hasn't come back up in {} seconds".format(','.join(service_set), wait_time)) | ||
| raise TimeoutError | ||
|
|
||
| # TODO: add timestamp | ||
| self.log("Service has restarted") | ||
|
|
||
| def handle_fast_reboot_health_check(self): | ||
| self.log("Check that device is still forwarding data plane traffic") | ||
| self.fails['dut'].add("Data plane has a forwarding problem after CPU went down") | ||
|
|
@@ -1017,6 +1053,10 @@ def wait_for_ssh_threads(signal): | |
| # verify there are no interface flaps after warm boot | ||
| self.neigh_lag_status_check() | ||
|
|
||
| if 'service-warm-restart' == self.reboot_type: | ||
| # verify there are no interface flaps after warm boot | ||
| self.neigh_lag_status_check() | ||
|
|
||
| def handle_advanced_reboot_health_check_kvm(self): | ||
| self.log("Wait until data plane stops") | ||
| forward_stop_signal = multiprocessing.Event() | ||
|
|
@@ -1193,8 +1233,11 @@ def runTest(self): | |
| thr = threading.Thread(target=self.reboot_dut) | ||
| thr.setDaemon(True) | ||
| thr.start() | ||
| self.wait_until_control_plane_down() | ||
| self.no_control_start = self.cpu_state.get_state_time('down') | ||
| if self.reboot_type != 'service-warm-restart': | ||
| self.wait_until_control_plane_down() | ||
| self.no_control_start = self.cpu_state.get_state_time('down') | ||
| else: | ||
| self.wait_until_service_restart() | ||
|
|
||
| if 'warm-reboot' in self.reboot_type: | ||
| finalizer_timeout = 60 + self.test_params['reboot_limit_in_seconds'] | ||
|
|
@@ -1210,7 +1253,7 @@ def runTest(self): | |
| else: | ||
| if self.reboot_type == 'fast-reboot': | ||
| self.handle_fast_reboot_health_check() | ||
| if 'warm-reboot' in self.reboot_type: | ||
| if 'warm-reboot' in self.reboot_type or 'service-warm-restart' == self.reboot_type: | ||
| self.handle_warm_reboot_health_check() | ||
| self.handle_post_reboot_health_check() | ||
|
|
||
|
|
@@ -1276,15 +1319,20 @@ def reboot_dut(self): | |
| time.sleep(self.reboot_delay) | ||
|
|
||
| if not self.kvm_test and\ | ||
| (self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type): | ||
| (self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type): | ||
| self.sender_thr = threading.Thread(target = self.send_in_background) | ||
| self.sniff_thr = threading.Thread(target = self.sniff_in_background) | ||
| self.sniffer_started = threading.Event() # Event for the sniff_in_background status. | ||
| self.sniff_thr.start() | ||
| self.sender_thr.start() | ||
|
|
||
| self.log("Rebooting remote side") | ||
| stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30) | ||
| if self.reboot_type != 'service-warm-restart': | ||
| stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30) | ||
| else: | ||
| self.restart_service() | ||
| return | ||
|
|
||
| if stdout != []: | ||
| self.log("stdout from %s: %s" % (self.reboot_type, str(stdout))) | ||
| if stderr != []: | ||
|
|
@@ -1300,6 +1348,42 @@ def reboot_dut(self): | |
|
|
||
| return | ||
|
|
||
| def restart_service(self): | ||
| for service_name in self.test_params['service_list']: | ||
| if 'image_path_on_dut' in self.service_data[service_name]: | ||
| stdout, stderr, return_code = self.dut_connection.execCommand("sudo sonic-installer upgrade-docker {} {} -y --warm".format(service_name, self.service_data[service_name]['image_path_on_dut']), timeout=30) | ||
| else: | ||
| self.dut_connection.execCommand('sudo config warm_restart enable {}'.format(service_name)) | ||
| self.pre_service_warm_restart(service_name) | ||
| stdout, stderr, return_code = self.dut_connection.execCommand('sudo service {} restart'.format(service_name)) | ||
|
|
||
| if stdout != []: | ||
| self.log("stdout from %s %s: %s" % (self.reboot_type, service_name, str(stdout))) | ||
| if stderr != []: | ||
| self.log("stderr from %s %s: %s" % (self.reboot_type, service_name, str(stderr))) | ||
| self.fails['dut'].add("service warm restart {} failed with error {}".format(service_name, stderr)) | ||
| thread.interrupt_main() | ||
| raise Exception("{} failed with error {}".format(self.reboot_type, stderr)) | ||
| self.log("return code from %s %s: %s" % (self.reboot_type, service_name, str(return_code))) | ||
| if return_code not in [0, 255]: | ||
| thread.interrupt_main() | ||
|
|
||
| def pre_service_warm_restart(self, service_name): | ||
| """Copy from src/sonic-utilities/sonic_installer/main.py to do some special operation for particular containers | ||
| """ | ||
| if service_name == 'swss': | ||
| cmd = 'docker exec -i swss orchagent_restart_check -w 2000 -r 5' | ||
| stdout, stderr, return_code = self.dut_connection.execCommand(cmd) | ||
| if return_code != 0: | ||
| self.log('stdout from {}: {}'.format(cmd, str(stdout))) | ||
| self.log('stderr from {}: {}'.format(cmd, str(stderr))) | ||
| self.log('orchagent is not in clean state, RESTARTCHECK failed: {}'.format(return_code)) | ||
Junchao-Mellanox marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| elif service_name == 'bgp': | ||
| self.dut_connection.execCommand('docker exec -i bgp pkill -9 zebra') | ||
| self.dut_connection.execCommand('docker exec -i bgp pkill -9 bgpd') | ||
| elif service_name == 'teamd': | ||
| self.dut_connection.execCommand('docker exec -i teamd pkill -USR1 teamd > /dev/null') | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about syncd warm restart? In which case, there needs to be syncd preshutdown?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I know, syncd warm restart is not supported. |
||
|
|
||
| def cmd(self, cmds): | ||
| process = subprocess.Popen(cmds, | ||
| shell=False, | ||
|
|
@@ -1325,7 +1409,7 @@ def peer_state_check(self, ip, queue): | |
| lacp_pdu_down_times and len(lacp_pdu_down_times) > 0 else None | ||
| lacp_pdu_after_reboot = float(lacp_pdu_up_times[0]) if\ | ||
| lacp_pdu_up_times and len(lacp_pdu_up_times) > 0 else None | ||
| if 'warm-reboot' in self.reboot_type and lacp_pdu_before_reboot and lacp_pdu_after_reboot: | ||
| if ('warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type) and lacp_pdu_before_reboot and lacp_pdu_after_reboot: | ||
| lacp_time_diff = lacp_pdu_after_reboot - lacp_pdu_before_reboot | ||
| if lacp_time_diff >= 90 and not self.kvm_test: | ||
| self.fails['dut'].add("LACP session likely terminated by neighbor ({})".format(ip) +\ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,7 +42,7 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs | |
| @param tbinfo: fixture provides information about testbed | ||
| @param kwargs: extra parameters including reboot type | ||
| ''' | ||
| assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType']) , ( | ||
| assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType'] or 'service-warm-restart' in kwargs['rebootType']) , ( | ||
| "Please set rebootType var." | ||
| ) | ||
|
|
||
|
|
@@ -90,6 +90,10 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs | |
|
|
||
| self.__buildTestbedData(tbinfo) | ||
|
|
||
| if self.rebootType == 'service-warm-restart': | ||
| assert hasattr(self, 'service_list') | ||
| self.service_data = {} | ||
|
|
||
| def __extractTestParam(self): | ||
| ''' | ||
| Extract test parameters from pytest request object. Note that all the parameters have default values. | ||
|
|
@@ -106,6 +110,7 @@ def __extractTestParam(self): | |
| self.replaceFastRebootScript = self.request.config.getoption("--replace_fast_reboot_script") | ||
| self.postRebootCheckScript = self.request.config.getoption("--post_reboot_check_script") | ||
| self.bgpV4V6TimeDiff = self.request.config.getoption("--bgp_v4_v6_time_diff") | ||
| self.new_docker_image = self.request.config.getoption("--new_docker_image") | ||
|
|
||
| # Set default reboot limit if it is not given | ||
| if self.rebootLimit is None: | ||
|
|
@@ -326,6 +331,35 @@ def __handleRebootImage(self): | |
| logger.info('Remove downloaded tempfile') | ||
| self.duthost.shell('rm -f {}'.format(tempfile)) | ||
|
|
||
| def __handleDockerImage(self): | ||
| """Download and install docker image to DUT | ||
| """ | ||
| if self.new_docker_image is None: | ||
| return | ||
|
|
||
| for service_name in self.service_list: | ||
| data = {} | ||
| docker_image_name = self.duthost.shell('docker ps | grep {} | awk \'{{print $2}}\''.format(service_name))['stdout'] | ||
| cmd = 'docker images {} --format \{{\{{.ID\}}\}}'.format(docker_image_name) | ||
| data['image_id'] = self.duthost.shell(cmd)['stdout'] | ||
| data['image_name'], data['image_tag'] = docker_image_name.split(':') | ||
|
|
||
| local_image_path = '/tmp/{}.gz'.format(data['image_name']) | ||
| logger.info('Downloading new docker image for {} to {}'.format(service_name, local_image_path)) | ||
| output = self.localhost.shell('curl --silent --write-out "%{{http_code}}" {0}/{1}.gz --output {2}'.format(self.new_docker_image, data['image_name'], local_image_path), module_ignore_errors=True)['stdout'] | ||
| if '404' not in output and os.path.exists(local_image_path): | ||
| temp_file = self.duthost.shell('mktemp')['stdout'] | ||
| self.duthost.copy(src=local_image_path, dest=temp_file) | ||
| self.localhost.shell('rm -f /tmp/{}.gz'.format(data['image_name'])) | ||
| data['image_path_on_dut'] = temp_file | ||
| logger.info('Successfully downloaded docker image, will perform in-service upgrade') | ||
| else: | ||
| data['image_path_on_dut'] = None | ||
| logger.info('Docker image not found, will perform in-service reboot') | ||
| self.service_data[service_name] = data | ||
|
|
||
| logger.info('service data = {}'.format(json.dumps(self.service_data, indent=2))) | ||
|
|
||
| def __setupTestbed(self): | ||
| ''' | ||
| Sets testbed up. It tranfers test data files, ARP responder, and runs script to update IPs and MAC addresses. | ||
|
|
@@ -438,7 +472,10 @@ def imageInstall(self, prebootList=None, inbootList=None, prebootFiles=None): | |
| self.__setupTestbed() | ||
|
|
||
| # Download and install new sonic image | ||
| self.__handleRebootImage() | ||
| if self.rebootType != 'service-warm-restart': | ||
| self.__handleRebootImage() | ||
| else: | ||
| self.__handleDockerImage() | ||
|
|
||
| # Handle mellanox platform | ||
| self.__handleMellanoxDut() | ||
|
|
@@ -586,7 +623,9 @@ def __runPtfRunner(self, rebootOper=None): | |
| "asic_type": self.duthost.facts["asic_type"], | ||
| "allow_mac_jumping": self.allowMacJump, | ||
| "preboot_files" : self.prebootFiles, | ||
| "alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword") | ||
| "alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword"), | ||
| "service_list": None if self.rebootType != 'service-warm-restart' else self.service_list, | ||
| "service_data": None if self.rebootType != 'service-warm-restart' else self.service_data, | ||
| } | ||
|
|
||
| if not isinstance(rebootOper, SadOperation): | ||
|
|
@@ -639,6 +678,36 @@ def __restorePrevImage(self): | |
| wait = self.readyTimeout | ||
| ) | ||
|
|
||
| def __restorePrevDockerImage(self): | ||
| """Restore previous docker image. | ||
| """ | ||
| for service_name, data in self.service_data.items(): | ||
| if data['image_path_on_dut'] is None: | ||
| self.duthost.shell('sudo config warm_restart disable {}'.format(service_name)) | ||
| continue | ||
|
|
||
| # We don't use sonic-installer rollback-docker CLI here because: | ||
| # 1. it does not remove the docker image which causes the running container still using the old image | ||
| # 2. it runs service restart for some containers which enlarge the test duration | ||
| self.duthost.shell('rm -f {}'.format(data['image_path_on_dut'])) | ||
| logger.info('Restore docker image for {}'.format(service_name)) | ||
| self.duthost.shell('service {} stop'.format(service_name)) | ||
| self.duthost.shell('docker rm {}'.format(service_name)) | ||
| image_ids = self.duthost.shell('docker images {} --format \{{\{{.ID\}}\}}'.format(data['image_name']))['stdout_lines'] | ||
| for image_id in image_ids: | ||
| if image_id != data['image_id']: | ||
| self.duthost.shell('docker rmi -f {}'.format(image_id)) | ||
| break | ||
|
|
||
| self.duthost.shell('docker tag {} {}:{}'.format(data['image_id'], data['image_name'], data['image_tag'])) | ||
|
|
||
| rebootDut( | ||
| self.duthost, | ||
| self.localhost, | ||
| reboot_type='cold', | ||
| wait = 300 | ||
| ) | ||
|
|
||
| def tearDown(self): | ||
| ''' | ||
| Tears down test case. It also verifies that config_db.json exists. | ||
|
|
@@ -658,7 +727,13 @@ def tearDown(self): | |
| self.__runScript([self.postRebootCheckScript], self.duthost) | ||
|
|
||
| if not self.stayInTargetImage: | ||
| self.__restorePrevImage() | ||
| logger.info('Restoring previous image') | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| if self.rebootType != 'service-warm-restart': | ||
| self.__restorePrevImage() | ||
| else: | ||
| self.__restorePrevDockerImage() | ||
| else: | ||
| logger.info('Stay in new image') | ||
|
|
||
| @pytest.fixture | ||
| def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo, creds): | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.