-
Notifications
You must be signed in to change notification settings - Fork 1k
[SmartSwitch] Add tests for reboot of a smart switch #16566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e0275da
d984657
69c103b
c3c4cd0
9dc360b
33a4cf0
8cb8420
dfbd6ab
54670c5
1e1964d
0c718ce
d63b366
a5903cf
36270b4
0a63a45
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| import logging | ||
| import pytest | ||
| from tests.common.reboot import reboot_ss_ctrl_dict as reboot_dict, REBOOT_TYPE_HISTOYR_QUEUE, \ | ||
vvolam marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| sync_reboot_history_queue_with_dut | ||
|
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Reboot type identifiers. Of these, only REBOOT_TYPE_COLD is acted on by
# log_and_perform_reboot in this module.
REBOOT_TYPE_COLD = "cold"
REBOOT_TYPE_UNKNOWN = "unknown"
REBOOT_TYPE_KERNEL_PANIC = "Kernel Panic"
REBOOT_TYPE_WATCHDOG = "Watchdog"
|
|
||
|
|
||
def log_and_perform_reboot(duthost, reboot_type, dpu_name):
    """
    Logs and issues the reboot command for the switch or for a single DPU.

    Only cold reboots of a smartswitch are carried out. Every other
    combination skips the calling test through pytest.skip, which raises,
    so this function either returns the command result or does not return.

    @param duthost: DUT host object
    @param reboot_type: Type of reboot to perform
    @param dpu_name: Name of the DPU to reboot, or None to reboot the switch
    @return: Result of duthost.command() for the reboot command issued
    """
    hostname = duthost.hostname

    if reboot_type != REBOOT_TYPE_COLD:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    if not duthost.is_smartswitch():
        if duthost.is_dpu():
            pytest.skip("Skipping the reboot test as the DUT is a DPU")
        # Without this skip the function would fall off the end and return
        # None, which callers then try to subscript.
        pytest.skip("Skipping the reboot test as the DUT {} is not a smartswitch".format(hostname))

    if dpu_name is None:
        logger.info("Sync reboot cause history queue with DUT reboot cause history queue")
        # NOTE(review): the hostname string is passed here; confirm that
        # sync_reboot_history_queue_with_dut does not expect the host object.
        sync_reboot_history_queue_with_dut(hostname)

        logger.info("Rebooting the switch {} with type {}".format(hostname, reboot_type))
        return duthost.command("sudo reboot")

    logger.info("Rebooting the DPU {} with type {}".format(dpu_name, reboot_type))
    return duthost.command("sudo reboot -d {}".format(dpu_name))
|
|
||
|
|
||
def perform_reboot(duthost, reboot_type=REBOOT_TYPE_COLD, dpu_name=None):
    """
    Issues a reboot of the switch or of one DPU and, for switch reboots,
    appends the reboot type to the reboot history queue.

    The test is skipped when reboot_type is not a key of reboot_dict. It is
    failed when no reboot command was issued or when the command result
    reports a failure. No post-reboot validation is done here.

    @param duthost: DUT host object
    @param reboot_type: Reboot type; must be a key of reboot_dict
    @param dpu_name: Name of the DPU to reboot, or None to reboot the switch
    """
    if reboot_type not in reboot_dict:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    # Used only in failure messages: the switch hostname or "DPU <name>".
    target = duthost.hostname if dpu_name is None else "DPU {}".format(dpu_name)

    res = log_and_perform_reboot(duthost, reboot_type, dpu_name)
    if res is None:
        # The helper can return without issuing a command; subscripting None
        # would raise a TypeError that hides the real cause.
        pytest.fail("No reboot command was issued for the {} with type {}".format(target, reboot_type))
    if res['failed'] is True:
        pytest.fail("Failed to reboot the {} with type {}".format(target, reboot_type))

    if dpu_name is None:
        logger.info("Appending the last reboot type to the queue")
        REBOOT_TYPE_HISTOYR_QUEUE.append(reboot_type)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,11 +7,13 @@ | |
| import re | ||
| from tests.common.platform.processes_utils import wait_critical_processes | ||
| from tests.common.reboot import reboot, REBOOT_TYPE_COLD | ||
| from tests.smartswitch.common.device_utils_dpu import get_dpu_link_status,\ | ||
| check_dpu_ping_status, check_dpu_link_and_status, check_dpu_module_status,\ | ||
| from tests.common.helpers.platform_api import module | ||
| from tests.smartswitch.common.device_utils_dpu import check_dpu_link_and_status,\ | ||
| pre_test_check, post_test_switch_check, post_test_dpus_check,\ | ||
| check_dpu_reboot_cause, num_dpu_modules # noqa: F401 | ||
| num_dpu_modules # noqa: F401 | ||
| from tests.common.platform.device_utils import platform_api_conn, start_platform_api_service # noqa: F401,F403 | ||
| from tests.smartswitch.common.reboot import perform_reboot | ||
| from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor | ||
|
|
||
| pytestmark = [ | ||
| pytest.mark.topology('smartswitch') | ||
|
|
@@ -153,9 +155,7 @@ def test_dpu_status_post_dpu_kernel_panic(duthosts, dpuhosts, | |
| dpuhosts[dpu_id].shell(kernel_panic_cmd, executable="/bin/bash") | ||
|
|
||
| logging.info("Executing post test dpu check") | ||
| post_test_dpus_check(duthost, dpuhosts, | ||
| dpu_on_list, dpu_off_list, | ||
| ip_address_list, num_dpu_modules) | ||
| post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware") | ||
|
|
||
|
|
||
| def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts, | ||
|
|
@@ -182,6 +182,70 @@ def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts, | |
| dpuhosts[dpu_id].shell(memory_exhaustion_cmd, executable="/bin/bash") | ||
|
|
||
| logging.info("Executing post test dpu check") | ||
| post_test_dpus_check(duthost, dpuhosts, | ||
| dpu_on_list, dpu_off_list, | ||
| ip_address_list, num_dpu_modules) | ||
| post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, | ||
| num_dpu_modules, "Non-Hardware") | ||
|
|
||
|
|
||
def test_cold_reboot_dpus(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                          platform_api_conn, num_dpu_modules):  # noqa: F811, E501
    """
    Test to cold reboot all DPUs in the DUT.
    Steps:
    1. Perform pre-test checks to gather DPU state.
    2. Initiate cold reboot on all DPUs concurrently.
    3. Fail the test if any reboot request raised an exception.
    4. Perform post-test checks to verify the state after reboot.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to reboot
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    ip_address_list, dpu_on_list, dpu_off_list = pre_test_check(duthost, platform_api_conn, num_dpu_modules)

    # Filled in by the worker threads and read only after the pool has exited.
    reboot_failures = []

    def reboot_dpu(duthost, platform_api_conn, index):
        try:
            dpu_name = module.get_name(platform_api_conn, index)
            perform_reboot(duthost, REBOOT_TYPE_COLD, dpu_name)
        except Exception as e:
            logging.error(f"Failed to reboot DPU at index {index}: {e}")
            # Review feedback on this test: if the error were only logged, a
            # reboot command that never ran could still let the post check
            # and the test pass. Record it so the test fails below.
            reboot_failures.append((index, e))

    # NOTE(review): assumes leaving the with block waits for every submitted
    # task, as concurrent.futures executors do - confirm for SafeThreadPoolExecutor.
    with SafeThreadPoolExecutor(max_workers=num_dpu_modules) as executor:
        logging.info("Rebooting all DPUs in parallel")
        for index in range(num_dpu_modules):
            executor.submit(reboot_dpu, duthost, platform_api_conn, index)

    if reboot_failures:
        details = "; ".join(f"index {index}: {error}" for index, error in reboot_failures)
        pytest.fail(f"Failed to reboot {len(reboot_failures)} DPU(s): {details}")

    logging.info("Executing post test dpu check")
    post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")
|
|
||
|
|
||
def test_cold_reboot_switch(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                            platform_api_conn, num_dpu_modules):  # noqa: F811, E501
    """
    Cold reboot the switch and check the DPUs afterwards.

    Flow:
    1. Run the pre-test check to collect the DPU IP addresses and on/off lists.
    2. Cold reboot the switch itself (no DPU name is given).
    3. Run the post-test DPU check with "reboot" as its last argument.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to verify
    """
    switch = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    pre_check_result = pre_test_check(switch, platform_api_conn, num_dpu_modules)
    # The off list is returned by the pre check but not used by this test.
    dpu_ips, dpus_on, _dpus_off = pre_check_result

    logging.info("Starting switch reboot...")
    perform_reboot(switch, reboot_type=REBOOT_TYPE_COLD, dpu_name=None)

    logging.info("Executing post switch reboot dpu check")
    post_test_dpus_check(
        switch,
        dpuhosts,
        dpus_on,
        dpu_ips,
        num_dpu_modules,
        "reboot",
    )
Uh oh!
There was an error while loading. Please reload this page.