Skip to content

Commit 61c6475

Browse files
lizhijianrd and arista-hpandya authored and committed
[test_ro_disk] Recover DUT to RW state by power-cycle when reboot doesn't work (sonic-net#13974)
What is the motivation for this PR? On some platforms, DUT cannot be recovered from RO-disk state by reboot. (e.g., On Nokia-7215, we saw the reboot is blocked by systemd-journald.service) To avoid DUT stuck at RO disk state, this PR introduce power-cycle as the final approach to recover DUT. How did you do it? If reboot failed to recover DUT from RO disk state, try power-cycle to recover the DUT. How did you verify/test it? Verified on Nokia-7215 M0 testbed. Get test passed with below logs: tacacs/test_ro_disk.py::test_ro_disk[dut-7215-4] -------------------------------------------------------------------------------- live log call -------------------------------------------------------------------------------- 10:02:17 test_ro_disk.do_reboot L0089 ERROR | DUT did not go down, exception: run module command failed, Ansible Results => {"failed": true, "msg": "Timeout (62s) waiting for privilege escalation prompt: "} attempt:0/3 10:04:02 test_ro_disk.do_reboot L0089 ERROR | DUT did not go down, exception: run module command failed, Ansible Results => {"failed": true, "msg": "Timeout (62s) waiting for privilege escalation prompt: "} attempt:1/3 10:05:24 test_ro_disk.do_reboot L0089 ERROR | DUT did not go down, exception: run module command failed, Ansible Results => {"failed": true, "msg": "Timeout (62s) waiting for privilege escalation prompt: "} attempt:2/3 10:05:44 test_ro_disk.do_reboot L0095 ERROR | Failed to reboot DUT after 3 retries 10:05:44 test_ro_disk.test_ro_disk L0262 WARNING| Failed to reboot dut-7215-4, try PDU reboot to restore disk RW state PASSED
1 parent 0ed92c1 commit 61c6475

2 files changed

Lines changed: 52 additions & 8 deletions

File tree

tests/common/utilities.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,29 @@ def get_plt_reboot_ctrl(duthost, tc_name, reboot_type):
760760
return reboot_dict
761761

762762

763+
def pdu_reboot(pdu_controller):
    """Recover the DUT by power-cycling its PDU outlets (off, pause, on).

    Args:
        pdu_controller: controller object implementing the BasePduController
            interface; obtain it from the fixture
            tests.common.plugins.pdu_controller.pdu_controller.

    Returns:
        bool: True when both the power-off and power-on operations succeed,
        False when the controller is missing or either operation fails.
    """
    if not pdu_controller:
        logging.warning("pdu_controller is None, skip PDU reboot")
        return False

    dut = pdu_controller.dut_hostname

    if not pdu_controller.turn_off_outlet():
        logging.error("Turn off the PDU outlets of {} failed".format(dut))
        return False

    # Keep the outlets off briefly so the power-on is a genuine cold start.
    time.sleep(10)

    if not pdu_controller.turn_on_outlet():
        logging.error("Turn on the PDU outlets of {} failed".format(dut))
        return False

    return True
784+
785+
763786
def get_image_type(duthost):
764787
"""get the SONiC image type
765788
It might be public/microsoft/...or any other type.

tests/tacacs/test_ro_disk.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from tests.common.utilities import wait_until
99
from tests.common.utilities import skip_release
1010
from tests.common.utilities import wait
11+
from tests.common.utilities import pdu_reboot
1112
from tests.common.reboot import reboot
1213
from .test_ro_user import ssh_remote_run
1314
from .utils import setup_tacacs_client, change_and_wait_aaa_config_update
@@ -55,6 +56,13 @@ def chk_ssh_remote_run(localhost, remote_ip, username, password, cmd):
5556
return rc == 0
5657

5758

59+
def do_pdu_reboot(duthost, localhost, duthosts, pdu_controller):
    """Power-cycle duthost via its PDU and wait for it to become healthy.

    Last-resort recovery used when a software reboot cannot bring the DUT
    out of the read-only-disk state.

    Args:
        duthost: DUT host object to recover.
        localhost: localhost object used for reachability checks.
        duthosts: all DUT hosts (needed for supervisor/linecard checks).
        pdu_controller: PDU controller object, or None when no PDU is available.

    Returns:
        bool: True when the power cycle succeeded and the post-reboot health
        check passed, False otherwise.
    """
    if not pdu_reboot(pdu_controller):
        logger.error("Failed to do PDU reboot for {}".format(duthost.hostname))
        # Return an explicit False (the original fell through with a bare
        # "return", i.e. None) so the result is a bool, consistent with
        # do_reboot() and post_reboot_healthcheck().
        return False
    return post_reboot_healthcheck(duthost, localhost, duthosts, 20)
64+
65+
5866
def do_reboot(duthost, localhost, duthosts):
5967
# occasionally reboot command fails with some kernel error messages
6068
# Hence retry if needed.
@@ -83,20 +91,30 @@ def do_reboot(duthost, localhost, duthosts):
8391

8492
wait(wait_time, msg="Wait {} seconds before retry.".format(wait_time))
8593

86-
assert rebooted, "Failed to reboot"
94+
if not rebooted:
95+
logger.error("Failed to reboot DUT after {} retries".format(retries))
96+
return False
97+
98+
return post_reboot_healthcheck(duthost, localhost, duthosts, wait_time)
99+
100+
101+
def post_reboot_healthcheck(duthost, localhost, duthosts, wait_time):
    """Wait for duthost to come back after a reboot and verify its health.

    Waits for SSH reachability, a stabilization period, and all critical
    services; on a supervisor node, also verifies every linecard's critical
    processes and that all admin-up ports are operationally up.

    Returns:
        bool: True when every check passed, False on the first failure.
    """
    localhost.wait_for(host=duthost.mgmt_ip, port=22, state="started", delay=10, timeout=300)
    wait(wait_time, msg="Wait {} seconds for system to be stable.".format(wait_time))

    if not wait_until(300, 20, 0, duthost.critical_services_fully_started):
        logger.error("Not all critical services fully started!")
        return False

    # Rebooting a chassis supervisor also reboots its linecards; confirm
    # each one comes back fully before declaring success.
    if duthost.is_supervisor_node():
        peers = [h for h in duthosts if h != duthost]
        for peer in peers:
            logger.info("checking if {} critical services are up".format(peer.hostname))
            wait_critical_processes(peer)
            if not wait_until(300, 20, 0, check_interface_status_of_up_ports, peer):
                logger.error("Not all ports that are admin up on are operationally up")
                return False

    return True
100118

101119

102120
def do_setup_tacacs(ptfhost, duthost, tacacs_creds):
@@ -142,7 +160,7 @@ def log_rotate(duthost):
142160

143161

144162
def test_ro_disk(localhost, ptfhost, duthosts, enum_rand_one_per_hwsku_hostname,
145-
tacacs_creds, check_tacacs):
163+
tacacs_creds, check_tacacs, pdu_controller):
146164
"""test tacacs rw user
147165
"""
148166
duthost = duthosts[enum_rand_one_per_hwsku_hostname]
@@ -162,7 +180,7 @@ def test_ro_disk(localhost, ptfhost, duthosts, enum_rand_one_per_hwsku_hostname,
162180
#
163181
logger.info("PRETEST: reboot {} to restore system state".
164182
format(enum_rand_one_per_hwsku_hostname))
165-
do_reboot(duthost, localhost, duthosts)
183+
assert do_reboot(duthost, localhost, duthosts), "Failed to reboot"
166184
assert do_check_clean_state(duthost), "state not good even after reboot"
167185
do_setup_tacacs(ptfhost, duthost, tacacs_creds)
168186

@@ -240,7 +258,10 @@ def test_ro_disk(localhost, ptfhost, duthosts, enum_rand_one_per_hwsku_hostname,
240258
finally:
241259
logger.debug("START: reboot {} to restore disk RW state".
242260
format(enum_rand_one_per_hwsku_hostname))
243-
do_reboot(duthost, localhost, duthosts)
261+
if not do_reboot(duthost, localhost, duthosts):
262+
logger.warning("Failed to reboot {}, try PDU reboot to restore disk RW state".
263+
format(enum_rand_one_per_hwsku_hostname))
264+
do_pdu_reboot(duthost, localhost, duthosts, pdu_controller)
244265
logger.debug(" END: reboot {} to restore disk RW state".
245266
format(enum_rand_one_per_hwsku_hostname))
246267

0 commit comments

Comments (0)