def is_smartswitch(self):
    """Check if the current node is a SmartSwitch.

    The decision is made from the running config's
    ``DEVICE_METADATA -> localhost`` entry: ``subtype`` must equal
    ``"SmartSwitch"`` and a ``type`` key must be present with a value other
    than ``"SmartSwitchDPU"``.

    Returns:
        True if the current node is a SmartSwitch, else False
    """
    config_facts = self.config_facts(host=self.hostname, source="running")['ansible_facts']
    # A missing DEVICE_METADATA or localhost section collapses to an empty
    # dict, so every check below simply evaluates to False.
    localhost = config_facts.get("DEVICE_METADATA", {}).get("localhost", {})
    return (
        localhost.get("subtype") == "SmartSwitch"
        # "type" must be present: a SmartSwitch subtype without any type
        # is deliberately NOT reported as a SmartSwitch.
        and "type" in localhost
        and localhost["type"] != "SmartSwitchDPU"
    )


def is_dpu(self):
    """Check if the current node is a DPU.

    A node is a DPU when the running config's
    ``DEVICE_METADATA -> localhost -> type`` equals ``"SmartSwitchDPU"``.

    Returns:
        True if the current node is a DPU, else False
    """
    config_facts = self.config_facts(host=self.hostname, source="running")['ansible_facts']
    localhost = config_facts.get("DEVICE_METADATA", {}).get("localhost", {})
    # .get() yields None when "type" is absent, which never equals the DPU tag.
    return localhost.get("type") == "SmartSwitchDPU"
# Reboot control table for SmartSwitch DUTs, keyed by reboot type.
#   command                : command run on the DUT to trigger the reboot
#   timeout, wait          : timing values (presumably seconds, as consumed by
#                            reboot() - confirm against the caller)
#   cause                  : pattern the reported reboot cause is expected to
#                            match (looks like a regex - confirm against the consumer)
#   test_reboot_cause_only : flag read by code outside this block
reboot_ss_ctrl_dict = {
    REBOOT_TYPE_COLD: {
        "command": "reboot",
        "timeout": 300,
        "wait": 120,
        "cause": r"'reboot'|Non-Hardware \(reboot|^reboot",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_WATCHDOG: {
        "command": "watchdogutil arm -s 5",
        "timeout": 300,
        "wait": 120,
        "cause": "Watchdog",
        "test_reboot_cause_only": True
    }
}


@support_ignore_loganalyzer
def reboot_smartswitch(duthost, reboot_type=REBOOT_TYPE_COLD):
    """
    Reboot a SmartSwitch DUT with the command registered for ``reboot_type``.

    :param duthost: DUT host object
    :param reboot_type: reboot type; must be a key of ``reboot_ss_ctrl_dict``,
                        otherwise the running test is skipped
    :return: ``[reboot_res, dut_datetime]`` - the result of the reboot command
             and the DUT time sampled just before the command was issued
    """
    if reboot_type not in reboot_ss_ctrl_dict:
        # reboot() unpacks this function's result into two names, so quietly
        # returning None here would surface later as an unrelated TypeError.
        # Skip explicitly instead, as the smartswitch reboot helper does for
        # the same condition.
        import pytest  # local import keeps the module's top-level imports untouched
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    hostname = duthost.hostname
    # Sample the DUT clock before the reboot command is issued.
    dut_datetime = duthost.get_now_time(utc_timezone=True)

    # Use the module logger (not the root logger) like the rest of this module.
    logger.info("Rebooting the DUT {} with type {}".format(hostname, reboot_type))

    reboot_res = duthost.command(reboot_ss_ctrl_dict[reboot_type]["command"])

    return [reboot_res, dut_datetime]
reboot(duthost, localhost, reboot_type='cold', delay=10, console_thread_res = pool.apply_async( collect_console_log, args=(duthost, localhost, timeout + wait_conlsole_connection)) time.sleep(wait_conlsole_connection) - reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type) + # Perform reboot + if duthost.is_smartswitch(): + reboot_res, dut_datetime = reboot_smartswitch(duthost, reboot_type) + else: + reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, + reboot_kwargs, reboot_type) wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res) diff --git a/tests/platform_tests/test_reboot.py b/tests/platform_tests/test_reboot.py index 97be66e0bf8..6ccfbd5d33a 100644 --- a/tests/platform_tests/test_reboot.py +++ b/tests/platform_tests/test_reboot.py @@ -221,6 +221,9 @@ def test_fast_reboot(duthosts, enum_rand_one_per_hwsku_hostname, if duthost.is_multi_asic: pytest.skip("Multi-ASIC devices not supporting fast reboot") + if duthost.is_smartswitch(): + pytest.skip("Smart Switch devices does not support fast reboot") + reboot_and_check(localhost, duthost, conn_graph_facts.get("device_conn", {}).get(duthost.hostname, {}), xcvr_skip_list, reboot_type=REBOOT_TYPE_FAST, duthosts=duthosts) @@ -236,6 +239,9 @@ def test_warm_reboot(duthosts, enum_rand_one_per_hwsku_hostname, if duthost.is_multi_asic: pytest.skip("Multi-ASIC devices not supporting warm reboot") + if duthost.is_smartswitch(): + pytest.skip("Smart Switch devices does not support warm reboot") + asic_type = duthost.facts["asic_type"] if asic_type in ["mellanox"]: diff --git a/tests/smartswitch/common/device_utils_dpu.py b/tests/smartswitch/common/device_utils_dpu.py index 0fe8b9303d5..aefbc8815ff 100644 --- a/tests/smartswitch/common/device_utils_dpu.py +++ b/tests/smartswitch/common/device_utils_dpu.py @@ -433,7 +433,7 @@ def post_test_switch_check(duthost, localhost, return -def post_test_dpu_check(duthost, 
import logging
import pytest
from tests.common.reboot import reboot_ss_ctrl_dict as reboot_dict, REBOOT_TYPE_HISTOYR_QUEUE, \
    sync_reboot_history_queue_with_dut

logger = logging.getLogger(__name__)

REBOOT_TYPE_COLD = "cold"
REBOOT_TYPE_UNKNOWN = "unknown"
REBOOT_TYPE_KERNEL_PANIC = "Kernel Panic"
REBOOT_TYPE_WATCHDOG = "Watchdog"


def log_and_perform_reboot(duthost, reboot_type, dpu_name):
    """
    Logs and issues a cold reboot of either the switch or one of its DPUs.

    The test is skipped (pytest.skip) when the reboot type is not cold, when
    the DUT is itself a DPU, or when the DUT is neither a SmartSwitch nor a
    DPU.

    @param duthost: DUT host object
    @param reboot_type: Type of reboot to perform
    @param dpu_name: Name of the DPU to reboot, or None to reboot the switch
    @return: Result of duthost.command() for the issued reboot command
    """
    hostname = duthost.hostname

    if reboot_type != REBOOT_TYPE_COLD:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    if duthost.is_smartswitch():
        if dpu_name is None:
            logger.info("Sync reboot cause history queue with DUT reboot cause history queue")
            # NOTE(review): the hostname string is passed here, not the host
            # object - confirm this matches what the helper expects.
            sync_reboot_history_queue_with_dut(hostname)

            logger.info("Rebooting the switch {} with type {}".format(hostname, reboot_type))
            return duthost.command("sudo reboot")

        logger.info("Rebooting the DPU {} with type {}".format(dpu_name, reboot_type))
        return duthost.command("sudo reboot -d {}".format(dpu_name))

    if duthost.is_dpu():
        pytest.skip("Skipping the reboot test as the DUT is a DPU")

    # Neither a SmartSwitch nor a DPU: nothing was rebooted. Skip explicitly
    # rather than returning None, which perform_reboot() would then index.
    pytest.skip("Skipping the reboot test as the DUT {} is not a SmartSwitch".format(hostname))


def perform_reboot(duthost, reboot_type=REBOOT_TYPE_COLD, dpu_name=None):
    """
    Issues a reboot of the switch or of a single DPU and fails the test when
    the reboot command reports failure.

    For a switch reboot (dpu_name is None) the reboot type is also appended to
    REBOOT_TYPE_HISTOYR_QUEUE. No post-reboot validation is done here.

    @param duthost: DUT host object
    @param reboot_type: Reboot type; skipped when it is not a key of the
                        SmartSwitch reboot table
    @param dpu_name: DPU name, or None to reboot the switch itself
    """
    # NOTE(review): reboot_dict is keyed with constants from tests.common.reboot;
    # this assumes they equal the REBOOT_TYPE_* strings defined in this module.
    if reboot_type not in reboot_dict:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    res = log_and_perform_reboot(duthost, reboot_type, dpu_name)
    switch_reboot = dpu_name is None

    if res['failed'] is True:
        if switch_reboot:
            pytest.fail("Failed to reboot the {} with type {}".format(duthost.hostname, reboot_type))
        else:
            pytest.fail("Failed to reboot the DPU {} with type {}".format(dpu_name, reboot_type))

    if switch_reboot:
        logger.info("Appending the last reboot type to the queue")
        REBOOT_TYPE_HISTOYR_QUEUE.append(reboot_type)
.format(CMD_PCIE_INFO, duthost.hostname)) @@ -297,9 +295,8 @@ def test_system_health_summary(duthosts, dpuhosts, num_dpu_modules) logging.info("Checking DPU is completely UP") - post_test_dpus_check(duthost, dpuhosts, - dpu_on_list, dpu_off_list, - ip_address_list, num_dpu_modules) + post_test_dpus_check(duthost, dpuhosts, dpu_on_list, + ip_address_list, num_dpu_modules, "Non-Hardware") logging.info("Checking show system-health summary on Switch") output_health_summary = duthost.command("show system-health summary") diff --git a/tests/smartswitch/platform_tests/test_reload_dpu.py b/tests/smartswitch/platform_tests/test_reload_dpu.py index 0d24038c8cb..04670a0b2c1 100644 --- a/tests/smartswitch/platform_tests/test_reload_dpu.py +++ b/tests/smartswitch/platform_tests/test_reload_dpu.py @@ -7,11 +7,13 @@ import re from tests.common.platform.processes_utils import wait_critical_processes from tests.common.reboot import reboot, REBOOT_TYPE_COLD -from tests.smartswitch.common.device_utils_dpu import get_dpu_link_status,\ - check_dpu_ping_status, check_dpu_link_and_status, check_dpu_module_status,\ +from tests.common.helpers.platform_api import module +from tests.smartswitch.common.device_utils_dpu import check_dpu_link_and_status,\ pre_test_check, post_test_switch_check, post_test_dpus_check,\ - check_dpu_reboot_cause, num_dpu_modules # noqa: F401 + num_dpu_modules # noqa: F401 from tests.common.platform.device_utils import platform_api_conn, start_platform_api_service # noqa: F401,F403 +from tests.smartswitch.common.reboot import perform_reboot +from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor pytestmark = [ pytest.mark.topology('smartswitch') @@ -153,9 +155,7 @@ def test_dpu_status_post_dpu_kernel_panic(duthosts, dpuhosts, dpuhosts[dpu_id].shell(kernel_panic_cmd, executable="/bin/bash") logging.info("Executing post test dpu check") - post_test_dpus_check(duthost, dpuhosts, - dpu_on_list, dpu_off_list, - ip_address_list, num_dpu_modules) + 
def test_cold_reboot_dpus(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                          platform_api_conn, num_dpu_modules):    # noqa: F811, E501
    """
    Test to cold reboot all DPUs in the DUT.
    Steps:
    1. Perform pre-test checks to gather DPU state.
    2. Initiate cold reboot on all DPUs concurrently.
    3. Perform post-test checks on the DPUs that were on before the reboot.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to reboot
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    # Only the powered-on DPUs are verified afterwards; the off list is unused.
    ip_address_list, dpu_on_list, _ = pre_test_check(duthost, platform_api_conn, num_dpu_modules)

    def reboot_dpu(duthost, platform_api_conn, index):
        """Resolve the DPU name for a module index and cold reboot that DPU."""
        try:
            dpu_name = module.get_name(platform_api_conn, index)
            perform_reboot(duthost, REBOOT_TYPE_COLD, dpu_name)
        except Exception as e:
            # Best effort per DPU: one failing reboot must not stop the others.
            # logging.exception keeps the traceback that logging.error dropped.
            # NOTE(review): pytest skip/fail outcomes are not Exception
            # subclasses, so they are expected to pass through this handler.
            logging.exception(f"Failed to reboot DPU at index {index}: {e}")

    with SafeThreadPoolExecutor(max_workers=num_dpu_modules) as executor:
        logging.info("Rebooting all DPUs in parallel")
        for index in range(num_dpu_modules):
            executor.submit(reboot_dpu, duthost, platform_api_conn, index)

    logging.info("Executing post test dpu check")
    post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list,
                         num_dpu_modules, "Non-Hardware")


def test_cold_reboot_switch(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                            platform_api_conn, num_dpu_modules):    # noqa: F811, E501
    """
    Test to cold reboot the switch in the DUT.
    Steps:
    1. Perform pre-test checks to gather DPU state.
    2. Initiate a cold reboot on the switch.
    3. Perform post-test checks to verify the state of DPUs after the reboot.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to verify
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    # Only the powered-on DPUs are verified afterwards; the off list is unused.
    ip_address_list, dpu_on_list, _ = pre_test_check(duthost, platform_api_conn, num_dpu_modules)

    logging.info("Starting switch reboot...")
    # dpu_name=None selects a reboot of the switch itself.
    perform_reboot(duthost, REBOOT_TYPE_COLD, None)

    logging.info("Executing post switch reboot dpu check")
    # After a switch reboot the DPUs are checked against the "reboot" cause.
    post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "reboot")