-
Notifications
You must be signed in to change notification settings - Fork 1.8k
[process-reboot-cause]Address the issue: Incorrect reboot cause returned when warm reboot follows a hardware caused reboot #3880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
66a9a25
757d82b
70a7df1
0ea7089
8fb11e3
f37d08d
00603a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,7 @@ try: | |
| import pwd | ||
| import sys | ||
| import syslog | ||
| import re | ||
| except ImportError as err: | ||
| raise ImportError("%s - required module not found" % str(err)) | ||
|
|
||
|
|
@@ -22,6 +23,15 @@ REBOOT_CAUSE_DIR = "/host/reboot-cause/" | |
| REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "reboot-cause.txt" | ||
| PREVIOUS_REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "previous-reboot-cause.txt" | ||
| FIRST_BOOT_PLATFORM_FILE = "/tmp/notify_firstboot_to_platform" | ||
| REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline" | ||
| # The following SONIC_BOOT_TYPEs come from the warm/fast reboot script which is in sonic-utilities | ||
| # Because the system can be rebooted from some old versions, we have to take all possible BOOT options into consideration. | ||
| # On 201803, 201807 we have | ||
| # BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') fast-reboot" | ||
| # On 201811 and later we have | ||
| # BOOT_OPTIONS="$(echo $KERNEL_OPTIONS | sed -e 's/\s*linux\s*/BOOT_IMAGE=/') SONIC_BOOT_TYPE=${BOOT_TYPE_ARG}" where BOOT_TYPE_ARG can be warm, fastfast or fast | ||
| # To extract the common part of them, we should have the following PATTERN | ||
| REBOOT_TYPE_KEXEC_PATTERN = ".*(fast-reboot|warm|fastfast|fast).*" | ||
|
|
||
| UNKNOWN_REBOOT_CAUSE = "Unknown" | ||
|
|
||
|
|
@@ -47,6 +57,16 @@ def log_error(msg): | |
|
|
||
|
|
||
| # ============================= Functions ============================= | ||
def is_warmfast_reboot_from_proc_cmdline(cmdline_file=None):
    """Tell whether the kernel command line marks this boot as a warm/fast reboot.

    Only the first line of the file is inspected. The line is split on
    whitespace and every argument is compared exactly, so an unrelated
    argument that merely contains "fast" or "warm" (for example as part of
    an image path) is not mistaken for a reboot-type marker.

    Recognised markers:
      * a bare ``fast-reboot`` argument (written by the 201803/201807
        reboot scripts);
      * ``SONIC_BOOT_TYPE=<type>`` where <type> is warm, fastfast or fast
        (written by the 201811 and later reboot scripts). The value
        ``fast-reboot`` is accepted as well, because the substring
        pattern used previously also matched it.

    Args:
        cmdline_file: path of the file to read. When None (the default)
            REBOOT_TYPE_KEXEC_FILE is used.

    Returns:
        True if a warm/fast reboot marker is present, False otherwise
        (including when the file does not exist).
    """
    if cmdline_file is None:
        cmdline_file = REBOOT_TYPE_KEXEC_FILE

    if not os.path.isfile(cmdline_file):
        return False

    with open(cmdline_file, "r") as cause_file:
        kernel_args = cause_file.readline().split()

    boot_type_key = "SONIC_BOOT_TYPE="
    boot_type_values = ("warm", "fastfast", "fast", "fast-reboot")
    for arg in kernel_args:
        # Legacy scripts append the bare word instead of a key=value pair.
        if arg == "fast-reboot":
            return True
        if arg.startswith(boot_type_key) and arg[len(boot_type_key):] in boot_type_values:
            return True
    return False
|
|
||
|
|
||
| def main(): | ||
| log_info("Starting up...") | ||
|
|
@@ -73,40 +93,55 @@ def main(): | |
| try: | ||
| import sonic_platform | ||
|
|
||
| # Check if the previous reboot was caused by hardware | ||
| platform = sonic_platform.platform.Platform() | ||
|
|
||
| chassis = platform.get_chassis() | ||
|
|
||
| hardware_reboot_cause, optional_details = chassis.get_reboot_cause() | ||
| proc_cmdline_reboot_cause = None | ||
|
|
||
| if hardware_reboot_cause == chassis.REBOOT_CAUSE_NON_HARDWARE: | ||
| # The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will | ||
| # contain any software-related reboot info. We will use it as the previous cause. | ||
| # 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline | ||
| # If yes, the content of /host/reboot-cause/reboot-cause.txt will be treated as the reboot cause | ||
| if is_warmfast_reboot_from_proc_cmdline(): | ||
| if os.path.isfile(REBOOT_CAUSE_FILE): | ||
| cause_file = open(REBOOT_CAUSE_FILE, "r") | ||
| previous_reboot_cause = cause_file.readline().rstrip('\n') | ||
| cause_file.close() | ||
| # If it is FirstTime Boot and previous_reboot_cause is unknown | ||
| # and hardware_reboot cause is non_hardware then | ||
| # Update the reboot cause as required | ||
| if os.path.isfile(FIRST_BOOT_PLATFORM_FILE): | ||
| if (previous_reboot_cause == UNKNOWN_REBOOT_CAUSE): | ||
| previous_reboot_cause = UNKNOWN_REBOOT_CAUSE | ||
| os.remove(FIRST_BOOT_PLATFORM_FILE) | ||
| elif hardware_reboot_cause == chassis.REBOOT_CAUSE_HARDWARE_OTHER: | ||
| previous_reboot_cause = "{} ({})".format(hardware_reboot_cause, optional_details) | ||
| with open(REBOOT_CAUSE_FILE, "r") as cause_file: | ||
| proc_cmdline_reboot_cause = cause_file.readline().rstrip('\n') | ||
| else: | ||
| # /proc/cmdline says it's a warm/fast reboot but /host/reboot-cause.txt doesn't exist. | ||
| # report an error. | ||
| log_error("/proc/cmdline indicates a fast/warm reboot but {} doesn't exist".format(REBOOT_CAUSE_DIR)) | ||
|
||
|
|
||
| if proc_cmdline_reboot_cause is not None: | ||
| previous_reboot_cause = proc_cmdline_reboot_cause | ||
| else: | ||
| previous_reboot_cause = hardware_reboot_cause | ||
| # 2. Check if the previous reboot was caused by hardware | ||
| # If yes, the hardware reboot cause will be treated as the reboot cause | ||
|
||
| platform = sonic_platform.platform.Platform() | ||
|
|
||
| chassis = platform.get_chassis() | ||
|
|
||
| hardware_reboot_cause, optional_details = chassis.get_reboot_cause() | ||
|
|
||
| if hardware_reboot_cause == chassis.REBOOT_CAUSE_NON_HARDWARE: | ||
| # The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will | ||
| # contain any software-related reboot info. We will use it as the previous cause. | ||
| if os.path.isfile(REBOOT_CAUSE_FILE): | ||
| with open(REBOOT_CAUSE_FILE, "r") as cause_file: | ||
| previous_reboot_cause = cause_file.readline().rstrip('\n') | ||
| # If it is FirstTime Boot and previous_reboot_cause is unknown | ||
| # and hardware_reboot cause is non_hardware then | ||
| # Update the reboot cause as required | ||
| if os.path.isfile(FIRST_BOOT_PLATFORM_FILE): | ||
| if (previous_reboot_cause == UNKNOWN_REBOOT_CAUSE): | ||
| previous_reboot_cause = UNKNOWN_REBOOT_CAUSE | ||
| os.remove(FIRST_BOOT_PLATFORM_FILE) | ||
|
||
| elif hardware_reboot_cause == chassis.REBOOT_CAUSE_HARDWARE_OTHER: | ||
| previous_reboot_cause = "{} ({})".format(hardware_reboot_cause, optional_details) | ||
| else: | ||
| previous_reboot_cause = hardware_reboot_cause | ||
| except ImportError as err: | ||
| log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.") | ||
|
|
||
| # If there is a REBOOT_CAUSE_FILE, it will contain any software-related | ||
| # reboot info. We will use it as the previous cause. | ||
| if os.path.isfile(REBOOT_CAUSE_FILE): | ||
| cause_file = open(REBOOT_CAUSE_FILE, "r") | ||
| previous_reboot_cause = cause_file.readline().rstrip('\n') | ||
| cause_file.close() | ||
| with open(REBOOT_CAUSE_FILE, "r") as cause_file: | ||
| previous_reboot_cause = cause_file.readline().rstrip('\n') | ||
|
|
||
| # If it is FirstTime Boot and previous_reboot_cause is unknown | ||
| # Update the reboot cause as required | ||
|
|
@@ -115,9 +150,8 @@ def main(): | |
| previous_reboot_cause = UNKNOWN_REBOOT_CAUSE | ||
| os.remove(FIRST_BOOT_PLATFORM_FILE) | ||
| # Write the previous reboot cause to PREVIOUS_REBOOT_CAUSE_FILE | ||
| prev_cause_file = open(PREVIOUS_REBOOT_CAUSE_FILE, "w") | ||
| prev_cause_file.write(previous_reboot_cause) | ||
| prev_cause_file.close() | ||
| with open(PREVIOUS_REBOOT_CAUSE_FILE, "w") as prev_cause_file: | ||
| prev_cause_file.write(previous_reboot_cause) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not an issue with this PR or a change request for it; I just want to start the discussion here. This change is the result of a team brainstorm over the various combinations of hard/soft reboot reasons. We feel that we've covered all the cases for now, but in case we haven't, we could consider changing the previous reboot cause file to 3 lines: Reboot cause: (calculation result) Not sure whether any other part of our code depends on this file being a single line?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Personally I agree with your point. Probably it's better to display it in the 3-line form only when we know for sure that there are conflicts among the causes from different sources?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You got a good point. However, if we know there is a conflict, then we would have figured it out and fixed it in code. We cannot predict what we don't know yet. :-) |
||
|
|
||
| # Also log the previous reboot cause to the syslog | ||
| log_info("Previous reboot cause: {}".format(previous_reboot_cause)) | ||
|
|
@@ -127,9 +161,8 @@ def main(): | |
| os.remove(REBOOT_CAUSE_FILE) | ||
|
|
||
| # Write a new default reboot cause file for the next reboot | ||
| cause_file = open(REBOOT_CAUSE_FILE, "w") | ||
| cause_file.write(UNKNOWN_REBOOT_CAUSE) | ||
| cause_file.close() | ||
| with open(REBOOT_CAUSE_FILE, "w") as cause_file: | ||
| cause_file.write(UNKNOWN_REBOOT_CAUSE) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.