Skip to content

Commit 286827f

Browse files
authored
Modify reboot cause on Kernel Panic to give preference to SW cause (sonic-net#296)
Currently, if a Kernel Panic occurs and the LC reboots, an unrelated HW reboot cause can take precedence over the Kernel Panic SW reboot cause. To handle such scenarios, give the SW reboot cause preference over the HW reboot cause.
1 parent 0afee37 commit 286827f

2 files changed

Lines changed: 14 additions & 2 deletions

File tree

scripts/determine-reboot-cause

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ REBOOT_CAUSE_UNKNOWN = "Unknown"
4545
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
4646
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
4747
REBOOT_CAUSE_HEARTBEAT_LOSS = "Heartbeat with the Supervisor card lost"
48+
REBOOT_CAUSE_KERNEL_PANIC = "Kernel Panic"
4849

4950
# Global logger class instance
5051
sonic_logger = syslogger.SysLogger(SYSLOG_IDENTIFIER)
@@ -188,7 +189,7 @@ def determine_reboot_cause():
188189
software_reboot_cause = find_software_reboot_cause()
189190

190191
# The main decision logic of the reboot cause:
191-
# If software reboot cause is not heartbeat loss and there is a valid hardware reboot cause indicated by platform API,
192+
# If software reboot cause is neither Kernel Panic nor heartbeat loss, and there is a valid hardware reboot cause indicated by platform API,
192193
# check the software reboot cause to add additional reboot cause.
193194
# If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot
194195
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
@@ -197,7 +198,9 @@ def determine_reboot_cause():
197198
# the software_reboot_cause will be treated as the reboot cause if it's not unknown
198199
# otherwise, the cmdline_reboot_cause will be treated as the reboot cause if it's not none
199200
# Else the software_reboot_cause will be treated as the reboot cause
200-
if REBOOT_CAUSE_HEARTBEAT_LOSS not in software_reboot_cause and REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause:
201+
if (REBOOT_CAUSE_KERNEL_PANIC not in software_reboot_cause and
202+
REBOOT_CAUSE_HEARTBEAT_LOSS not in software_reboot_cause and
203+
REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause):
201204
previous_reboot_cause = hardware_reboot_cause
202205
# Check if any software reboot was issued before this hardware reboot happened
203206
if software_reboot_cause is not REBOOT_CAUSE_UNKNOWN:

tests/determine-reboot-cause_test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
EXPECTED_FIND_FIRSTBOOT_VERSION = " (First boot of SONiC version 20191130.52)"
7373
EXPECTED_FIND_SOFTWARE_REBOOT_CAUSE_FIRSTBOOT = "Unknown (First boot of SONiC version 20191130.52)"
7474
EXPECTED_FIND_SOFTWARE_HEATBEAT_LOSS = "Heartbeat with the Supervisor card lost"
75+
EXPECTED_FIND_SOFTWARE_KERNEL_PANIC = "Kernel Panic [Time: Sun Mar 28 13:45:12 UTC 2021]"
7576

7677
EXPECTED_WATCHDOG_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_15_08', 'cause': 'Watchdog', 'user': 'N/A', 'time': 'N/A'}
7778
EXPECTED_USER_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2020_10_22_03_14_07', 'cause': 'reboot', 'user': 'admin', 'time': 'Thu Oct 22 03:11:08 UTC 2020'}
@@ -209,6 +210,14 @@ def test_determine_reboot_cause_software_heartbeatloss_hardware_other(self):
209210
assert previous_reboot_cause == EXPECTED_FIND_SOFTWARE_HEATBEAT_LOSS
210211
assert additional_info == "N/A"
211212

213+
def test_determine_reboot_cause_software_kernelpanic_hardware_other(self):
214+
with mock.patch("determine_reboot_cause.find_proc_cmdline_reboot_cause", return_value=EXPECTED_PARSE_WARMFAST_REBOOT_FROM_PROC_CMDLINE):
215+
with mock.patch("determine_reboot_cause.find_software_reboot_cause", return_value=EXPECTED_FIND_SOFTWARE_KERNEL_PANIC):
216+
with mock.patch("determine_reboot_cause.find_hardware_reboot_cause", return_value=EXPECTED_HARDWARE_REBOOT_CAUSE):
217+
previous_reboot_cause, additional_info = determine_reboot_cause.determine_reboot_cause()
218+
assert previous_reboot_cause == EXPECTED_FIND_SOFTWARE_KERNEL_PANIC
219+
assert additional_info == "N/A"
220+
212221
@mock.patch('determine_reboot_cause.REBOOT_CAUSE_DIR', os.path.join(os.getcwd(), REBOOT_CAUSE_DIR))
213222
@mock.patch('determine_reboot_cause.REBOOT_CAUSE_HISTORY_DIR', os.path.join(os.getcwd(), 'host/reboot-cause/history/'))
214223
@mock.patch('determine_reboot_cause.PREVIOUS_REBOOT_CAUSE_FILE', os.path.join(os.getcwd(), 'host/reboot-cause/previous-reboot-cause.json'))

0 commit comments

Comments
 (0)