Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions tests/common/devices/sonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,42 @@ def is_supervisor_node(self):
inv_files = im._sources
return is_supervisor_node(inv_files, self.hostname)

def is_smartswitch(self):
    """Tell whether this node is a SmartSwitch (and not one of its DPUs).

    The answer is derived from the running configuration facts: the
    ``DEVICE_METADATA -> localhost`` entry must have ``subtype`` equal to
    ``"SmartSwitch"`` and must carry a ``type`` other than ``"SmartSwitchDPU"``.

    Returns:
        True if the current node is a SmartSwitch, else False
    """
    running_cfg = self.config_facts(host=self.hostname, source="running")['ansible_facts']

    # Bail out as soon as one level of the metadata hierarchy is missing.
    if "DEVICE_METADATA" not in running_cfg:
        return False
    device_metadata = running_cfg["DEVICE_METADATA"]

    if "localhost" not in device_metadata:
        return False
    local_meta = device_metadata["localhost"]

    if "subtype" not in local_meta or local_meta["subtype"] != "SmartSwitch":
        return False

    # A node without a "type" entry is not reported as a SmartSwitch.
    if "type" not in local_meta:
        return False

    return local_meta["type"] != "SmartSwitchDPU"

def is_dpu(self):
    """Tell whether this node is a DPU.

    The answer is derived from the running configuration facts: the
    ``DEVICE_METADATA -> localhost`` entry must have ``type`` equal to
    ``"SmartSwitchDPU"``.

    Returns:
        True if the current node is a DPU, else False
    """
    running_cfg = self.config_facts(host=self.hostname, source="running")['ansible_facts']

    # Bail out as soon as one level of the metadata hierarchy is missing.
    if "DEVICE_METADATA" not in running_cfg:
        return False
    device_metadata = running_cfg["DEVICE_METADATA"]

    if "localhost" not in device_metadata:
        return False
    local_meta = device_metadata["localhost"]

    if "type" not in local_meta:
        return False

    return local_meta["type"] == "SmartSwitchDPU"

def is_frontend_node(self):
"""Check if the current node is a frontend node in case of multi-DUT.

Expand Down
50 changes: 48 additions & 2 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from tests.common.helpers.dut_utils import ignore_t2_syslog_msgs, create_duthost_console, creds_on_dut
from tests.common.fixtures.conn_graph_facts import get_graph_facts


logger = logging.getLogger(__name__)

# Create the waiting power on event
Expand Down Expand Up @@ -142,6 +141,26 @@
}
}

# Reboot control table for SmartSwitch DUTs, keyed by reboot type.
#   command : command to reboot the smartswitch DUT
# NOTE(review): "timeout" and "wait" look like durations in seconds, "cause" like a
# pattern matched against the reported reboot cause, and "test_reboot_cause_only" like
# a flag limiting the test to the reboot-cause check -- confirm against the consumers.
reboot_ss_ctrl_dict = {
    REBOOT_TYPE_COLD: {
        "command": "reboot",
        "timeout": 300,
        "wait": 120,
        "cause": r"'reboot'|Non-Hardware \(reboot|^reboot",
        "test_reboot_cause_only": False,
    },
    REBOOT_TYPE_WATCHDOG: {
        "command": "watchdogutil arm -s 5",
        "timeout": 300,
        "wait": 120,
        "cause": "Watchdog",
        "test_reboot_cause_only": True,
    },
}

MAX_NUM_REBOOT_CAUSE_HISTORY = 10
REBOOT_TYPE_HISTOYR_QUEUE = deque([], MAX_NUM_REBOOT_CAUSE_HISTORY)
REBOOT_CAUSE_HISTORY_TITLE = ["name", "cause", "time", "user", "comment"]
Expand Down Expand Up @@ -224,6 +243,28 @@ def execute_reboot_helper():
return [reboot_res, dut_datetime]


@support_ignore_loganalyzer
def reboot_smartswitch(duthost, reboot_type=REBOOT_TYPE_COLD):
    """
    Reboot a SmartSwitch or a DPU using the command registered for the reboot type.

    :param duthost: DUT host object
    :param reboot_type: reboot type; must be a key of reboot_ss_ctrl_dict
    :return: [reboot_res, dut_datetime], where reboot_res is the result of running the
        reboot command on the DUT and dut_datetime is the value of
        duthost.get_now_time(utc_timezone=True) taken just before issuing it.
        None when reboot_type is not a key of reboot_ss_ctrl_dict.
    """
    if reboot_type not in reboot_ss_ctrl_dict:
        # NOTE(review): nothing is rebooted and None is returned here; a caller that
        # unpacks the result into two names will raise TypeError -- confirm the
        # intended handling of unsupported reboot types.
        logger.info("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))
        return None

    hostname = duthost.hostname
    dut_datetime = duthost.get_now_time(utc_timezone=True)

    # Log through the module-level logger, like the rest of this module, rather than
    # through the root logger.
    logger.info("Rebooting the DUT {} with type {}".format(hostname, reboot_type))

    reboot_res = duthost.command(reboot_ss_ctrl_dict[reboot_type]["command"])

    return [reboot_res, dut_datetime]


@support_ignore_loganalyzer
def reboot(duthost, localhost, reboot_type='cold', delay=10,
timeout=0, wait=0, wait_for_ssh=True, wait_warmboot_finalizer=False, warmboot_finalizer_timeout=0,
Expand Down Expand Up @@ -284,7 +325,12 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
console_thread_res = pool.apply_async(
collect_console_log, args=(duthost, localhost, timeout + wait_conlsole_connection))
time.sleep(wait_conlsole_connection)
reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type)
# Perform reboot
if duthost.is_smartswitch():
reboot_res, dut_datetime = reboot_smartswitch(duthost, reboot_type)
else:
reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper,
reboot_kwargs, reboot_type)

wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res)

Expand Down
6 changes: 6 additions & 0 deletions tests/platform_tests/test_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ def test_fast_reboot(duthosts, enum_rand_one_per_hwsku_hostname,
if duthost.is_multi_asic:
pytest.skip("Multi-ASIC devices not supporting fast reboot")

if duthost.is_smartswitch():
pytest.skip("Smart Switch devices does not support fast reboot")

reboot_and_check(localhost, duthost, conn_graph_facts.get("device_conn", {}).get(duthost.hostname, {}),
xcvr_skip_list, reboot_type=REBOOT_TYPE_FAST, duthosts=duthosts)

Expand All @@ -236,6 +239,9 @@ def test_warm_reboot(duthosts, enum_rand_one_per_hwsku_hostname,
if duthost.is_multi_asic:
pytest.skip("Multi-ASIC devices not supporting warm reboot")

if duthost.is_smartswitch():
pytest.skip("Smart Switch devices does not support warm reboot")

asic_type = duthost.facts["asic_type"]

if asic_type in ["mellanox"]:
Expand Down
11 changes: 5 additions & 6 deletions tests/smartswitch/common/device_utils_dpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ def post_test_switch_check(duthost, localhost,
return


def post_test_dpu_check(duthost, dpuhosts, dpu_name):
def post_test_dpu_check(duthost, dpuhosts, dpu_name, reboot_cause):
"""
Runs all required checks for a given DPU
Args:
Expand Down Expand Up @@ -464,14 +464,13 @@ def post_test_dpu_check(duthost, dpuhosts, dpu_name):
logging.info(f"Checking reboot cause of {dpu_name}")
pytest_assert(
wait_until(REBOOT_CAUSE_TIMEOUT, REBOOT_CAUSE_INT, 0,
check_dpu_reboot_cause, duthost, dpu_name, "Non-Hardware"),
check_dpu_reboot_cause, duthost, dpu_name, reboot_cause),
f"Reboot cause for DPU {dpu_name} is incorrect"
)


def post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules):
def post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list,
num_dpu_modules, reboot_cause):
"""
Checks DPU OFF/ON and reboot cause status Post Test
Args:
Expand All @@ -489,7 +488,7 @@ def post_test_dpus_check(duthost, dpuhosts,
logging.info("Post test DPUs check in parallel")
for dpu in dpu_on_list:
executor.submit(post_test_dpu_check, duthost,
dpuhosts, dpu)
dpuhosts, dpu, reboot_cause)

logging.info("Checking all powered on DPUs connectivity")
ping_status = check_dpu_ping_status(duthost, ip_address_list)
Expand Down
62 changes: 62 additions & 0 deletions tests/smartswitch/common/reboot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import logging
import pytest
from tests.common.reboot import reboot_ss_ctrl_dict as reboot_dict, REBOOT_TYPE_HISTOYR_QUEUE, \
sync_reboot_history_queue_with_dut

logger = logging.getLogger(__name__)

# Reboot type identifiers. Of these, only REBOOT_TYPE_COLD is referenced by the helpers
# in this module (the comparison in log_and_perform_reboot and the default of
# perform_reboot).
# NOTE(review): reboot_dict is imported from tests.common.reboot and is keyed by that
# module's own REBOOT_TYPE_* constants; these local values are presumably meant to
# match those keys -- confirm, since a mismatch makes perform_reboot skip the test.
REBOOT_TYPE_COLD = "cold"
REBOOT_TYPE_UNKNOWN = "unknown"
REBOOT_TYPE_KERNEL_PANIC = "Kernel Panic"
REBOOT_TYPE_WATCHDOG = "Watchdog"


def log_and_perform_reboot(duthost, reboot_type, dpu_name):
    """
    Logs and initiates the reboot process based on the host type.

    Only a cold reboot issued from a SmartSwitch is supported. Every other
    combination skips the calling test, so a value is returned only when a
    reboot command was actually run.

    @param duthost: DUT host object
    @param reboot_type: Type of reboot to perform
    @param dpu_name: Name of the DPU to reboot, or None to reboot the switch itself
    @return: Result of duthost.command() for the reboot command that was issued
    """
    hostname = duthost.hostname

    if reboot_type != REBOOT_TYPE_COLD:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    if not duthost.is_smartswitch():
        if duthost.is_dpu():
            pytest.skip("Skipping the reboot test as the DUT is a DPU")
        # Without this explicit skip the function would fall through and return None,
        # and the caller's res['failed'] lookup would fail with a TypeError.
        pytest.skip("Skipping the reboot test as the DUT is not a SmartSwitch")

    if dpu_name is not None:
        logger.info("Rebooting the DPU {} with type {}".format(dpu_name, reboot_type))
        return duthost.command("sudo reboot -d {}".format(dpu_name))

    logger.info("Sync reboot cause history queue with DUT reboot cause history queue")
    # NOTE(review): the helper receives the hostname string, not the host object --
    # confirm this matches the signature of sync_reboot_history_queue_with_dut.
    sync_reboot_history_queue_with_dut(hostname)

    logger.info("Rebooting the switch {} with type {}".format(hostname, reboot_type))
    return duthost.command("sudo reboot")


def perform_reboot(duthost, reboot_type=REBOOT_TYPE_COLD, dpu_name=None):
    """
    Reboots the switch, or one of its DPUs, and fails the test if the command failed.

    The test is skipped when the reboot type is not present in reboot_dict. After a
    switch reboot (dpu_name is None) the reboot type is appended to
    REBOOT_TYPE_HISTOYR_QUEUE.

    @param duthost: DUT host object
    @param reboot_type: Reboot type
    @param dpu_name: DPU name, or None to reboot the switch itself
    """
    if reboot_type not in reboot_dict:
        pytest.skip("Skipping the reboot test as the reboot type {} is not supported".format(reboot_type))

    targets_switch = dpu_name is None
    result = log_and_perform_reboot(duthost, reboot_type, dpu_name)

    if result['failed'] is True:
        if targets_switch:
            failure = "Failed to reboot the {} with type {}".format(duthost.hostname, reboot_type)
        else:
            failure = "Failed to reboot the DPU {} with type {}".format(dpu_name, reboot_type)
        pytest.fail(failure)

    if targets_switch:
        logger.info("Appending the last reboot type to the queue")
        REBOOT_TYPE_HISTOYR_QUEUE.append(reboot_type)
9 changes: 3 additions & 6 deletions tests/smartswitch/platform_tests/test_platform_dpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,7 @@ def test_pcie_link(duthosts, dpuhosts,
duthost.shell("sudo config chassis modules \
startup %s" % (dpu_on_list[index]))

post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")

logging.info("Verifying output of '{}' on '{}'..."
.format(CMD_PCIE_INFO, duthost.hostname))
Expand Down Expand Up @@ -297,9 +295,8 @@ def test_system_health_summary(duthosts, dpuhosts,
num_dpu_modules)

logging.info("Checking DPU is completely UP")
post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list,
ip_address_list, num_dpu_modules, "Non-Hardware")

logging.info("Checking show system-health summary on Switch")
output_health_summary = duthost.command("show system-health summary")
Expand Down
82 changes: 73 additions & 9 deletions tests/smartswitch/platform_tests/test_reload_dpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import re
from tests.common.platform.processes_utils import wait_critical_processes
from tests.common.reboot import reboot, REBOOT_TYPE_COLD
from tests.smartswitch.common.device_utils_dpu import get_dpu_link_status,\
check_dpu_ping_status, check_dpu_link_and_status, check_dpu_module_status,\
from tests.common.helpers.platform_api import module
from tests.smartswitch.common.device_utils_dpu import check_dpu_link_and_status,\
pre_test_check, post_test_switch_check, post_test_dpus_check,\
check_dpu_reboot_cause, num_dpu_modules # noqa: F401
num_dpu_modules # noqa: F401
from tests.common.platform.device_utils import platform_api_conn, start_platform_api_service # noqa: F401,F403
from tests.smartswitch.common.reboot import perform_reboot
from tests.common.helpers.multi_thread_utils import SafeThreadPoolExecutor

pytestmark = [
pytest.mark.topology('smartswitch')
Expand Down Expand Up @@ -153,9 +155,7 @@ def test_dpu_status_post_dpu_kernel_panic(duthosts, dpuhosts,
dpuhosts[dpu_id].shell(kernel_panic_cmd, executable="/bin/bash")

logging.info("Executing post test dpu check")
post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")


def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts,
Expand All @@ -182,6 +182,70 @@ def test_dpu_check_post_dpu_mem_exhaustion(duthosts, dpuhosts,
dpuhosts[dpu_id].shell(memory_exhaustion_cmd, executable="/bin/bash")

logging.info("Executing post test dpu check")
post_test_dpus_check(duthost, dpuhosts,
dpu_on_list, dpu_off_list,
ip_address_list, num_dpu_modules)
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list,
num_dpu_modules, "Non-Hardware")


def test_cold_reboot_dpus(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
platform_api_conn, num_dpu_modules): # noqa: F811, E501
"""
Test to cold reboot all DPUs in the DUT.
Steps:
1. Perform pre-test checks to gather DPU state.
2. Initiate cold reboot on all DPUs concurrently.
3. Perform post-test checks to verify the state after reboot.

Args:
duthosts: DUT hosts object
dpuhosts: DPU hosts object
enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
platform_api_conn: Platform API connection object
num_dpu_modules: Number of DPU modules to reboot
"""
duthost = duthosts[enum_rand_one_per_hwsku_hostname]

logging.info("Executing pre test check")
ip_address_list, dpu_on_list, dpu_off_list = pre_test_check(duthost, platform_api_conn, num_dpu_modules)

def reboot_dpu(duthost, platform_api_conn, index):
    """Cold reboot the DPU module at ``index``; log and re-raise any failure.

    Args:
        duthost: DUT host object the reboot command is issued from
        platform_api_conn: Platform API connection used to resolve the module name
        index: Index of the DPU module to reboot

    Raises:
        Exception: whatever the name lookup or the reboot raised, after logging it.
    """
    try:
        dpu_name = module.get_name(platform_api_conn, index)
        perform_reboot(duthost, REBOOT_TYPE_COLD, dpu_name)
    except Exception as e:
        logging.error(f"Failed to reboot DPU at index {index}: {e}")
        # Re-raise instead of swallowing: otherwise a reboot that never happened is
        # only logged and the post-test checks can still pass. The exception is
        # recorded on the submitted task's future.
        # NOTE(review): assumes the executor surfaces task exceptions on exit -- confirm.
        raise
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @vvolam, I just found that all exceptions raised here while rebooting the DPU are ignored; only one error is printed in the test log. Is this the expected test behavior?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @congh-nvidia, do you mean we should differentiate the logging based on the type of exception encountered here?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @vvolam, I mean that the exceptions are not re-raised; the test only logs an error. If the reboot command somehow fails and no reboot actually happens, the post check could still pass and the test case would pass.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @vvolam, @nikamirrr will raise a PR for the fix along with another fix in this test.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@congh-nvidia sounds good. Thank you!


with SafeThreadPoolExecutor(max_workers=num_dpu_modules) as executor:
logging.info("Rebooting all DPUs in parallel")
for index in range(num_dpu_modules):
executor.submit(reboot_dpu, duthost, platform_api_conn, index)

logging.info("Executing post test dpu check")
post_test_dpus_check(duthost, dpuhosts, dpu_on_list, ip_address_list, num_dpu_modules, "Non-Hardware")


def test_cold_reboot_switch(duthosts, dpuhosts, enum_rand_one_per_hwsku_hostname,
                            platform_api_conn, num_dpu_modules):    # noqa: F811, E501
    """
    Cold reboot the switch and verify its DPUs afterwards.

    Steps:
    1. Run the pre-test check to collect the DPU IP addresses and on/off lists.
    2. Cold reboot the switch itself (no DPU name is given).
    3. Run the post-test DPU check, expecting "reboot" as the reboot cause.

    Args:
        duthosts: DUT hosts object
        dpuhosts: DPU hosts object
        enum_rand_one_per_hwsku_hostname: Randomized DUT hostname
        platform_api_conn: Platform API connection object
        num_dpu_modules: Number of DPU modules to verify
    """
    switch = duthosts[enum_rand_one_per_hwsku_hostname]

    logging.info("Executing pre test check")
    # The list of powered-off DPUs is returned as well but is not needed here.
    dpu_ips, powered_on_dpus, _powered_off_dpus = pre_test_check(switch, platform_api_conn, num_dpu_modules)

    logging.info("Starting switch reboot...")
    perform_reboot(switch, REBOOT_TYPE_COLD, None)

    logging.info("Executing post switch reboot dpu check")
    post_test_dpus_check(switch, dpuhosts, powered_on_dpus, dpu_ips, num_dpu_modules, "reboot")
Loading