-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Modified reboot pre-shutdown script to handle dpu side reboot #26234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e0a3919
77b3429
9c957e4
547adb5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,90 @@ LOG_MSG "Pre-reboot flush and cache clear complete." | |
|
|
||
| # Pause for 2 seconds, then run `cpldapp -w 0xd 200` inside the DPU container named in /host/dpu-docker-info/name (the log line describes this as writing pre-shutdown success to a CPLD register). | ||
| # NOTE(review): diff +/- markers are not visible in this view; this unguarded docker exec looks like the pre-change form of the guarded CPLD write that follows - confirm it is the removed line. | ||
| sleep 2 | ||
| LOG_MSG "Writing pre-shutdown success to CPLD reg" | ||
| docker exec "$(cat /host/dpu-docker-info/name)" /nic/bin/cpldapp -w 0xd 200 | ||
|
|
||
| DPU_NAME_FILE="/host/dpu-docker-info/name" | ||
| # Guard clause: without a non-empty container name file there is no container to exec into, so stop here. | ||
| if [ ! -s "$DPU_NAME_FILE" ]; then | ||
| LOG_MSG "Error: DPU container name file '$DPU_NAME_FILE' is missing or empty; aborting pre-reboot hook" | ||
| exit 1 | ||
| fi | ||
| # Container name is kept in a variable because the background ping monitor reuses it. | ||
| DPU_CONTAINER_NAME=$(cat "$DPU_NAME_FILE") | ||
| docker exec "$DPU_CONTAINER_NAME" /nic/bin/cpldapp -w 0xd 200 | ||
|
|
||
| # The midplane interface name and host IP are both read from this systemd network configuration file. | ||
| MIDPLANE_NETWORK_FILE="/usr/lib/systemd/network/bridge-midplane.network" | ||
| # Abort early when the config file is absent: nothing below can work without it. | ||
| [ -f "$MIDPLANE_NETWORK_FILE" ] || { | ||
| LOG_MSG "ERROR: Midplane network config file '$MIDPLANE_NETWORK_FILE' not found, aborting pre-reboot hook" | ||
| exit 1 | ||
| } | ||
|
|
||
| # Interface name = value of the first Name= key found between the [Match] and [Network] section headers. | ||
| MIDPLANE_IFACE=$(awk -F= '/^\[Match\]/,/^\[Network\]/ { if ($1 == "Name") {print $2; exit} }' "$MIDPLANE_NETWORK_FILE") | ||
| # An empty result means the key was not found; the hook cannot continue without an interface. | ||
| [ -n "$MIDPLANE_IFACE" ] || { | ||
| LOG_MSG "ERROR: Failed to extract interface name from '$MIDPLANE_NETWORK_FILE', aborting pre-reboot hook" | ||
| exit 1 | ||
| } | ||
| LOG_MSG "Using midplane interface: $MIDPLANE_IFACE" | ||
|
|
||
| # Extract host IP from the first Address= entry (strip the /prefix-length suffix). | ||
| # A .network file may carry several Address= lines; taking only the first keeps HOST_IP a single token, | ||
| # otherwise the ping monitor started later would be handed a multi-line target, fail every ping and power-cycle needlessly. | ||
| HOST_IP=$(awk -F= '$1 == "Address" { split($2, parts, "/"); print parts[1]; exit }' "$MIDPLANE_NETWORK_FILE") | ||
| if [ -z "$HOST_IP" ]; then | ||
| LOG_MSG "ERROR: Failed to extract host IP from '$MIDPLANE_NETWORK_FILE', aborting pre-reboot hook" | ||
| exit 1 | ||
| fi | ||
| LOG_MSG "Using host IP from bridge-midplane config: $HOST_IP" | ||
|
|
||
| # Spawn a fully independent background process that pings the host and triggers a power cycle if the host becomes unreachable. | ||
| # Algorithm: ping every 5 seconds until about 120 seconds of wait time have accumulated. If a ping fails, wait 10 seconds and retry; after 3 consecutive failures, trigger a CPLD power cycle. | ||
| # setsid + nohup completely detach the monitor from the parent process (daemonize). | ||
|
|
||
| # The monitor script is single-quoted and receives its inputs as positional parameters, so a quote or other shell metacharacter in the container name, host IP or interface name cannot alter the script text. | ||
| setsid nohup bash -c ' | ||
| DPU_CONTAINER=$1 | ||
| HOST_IP=$2 | ||
| MIDPLANE_IFACE=$3 | ||
| TIMEOUT=120 | ||
| POLL_INTERVAL=5 | ||
| ELAPSED=0 | ||
| MAX_FAILURES=3 | ||
| RETRY_WAIT=10 | ||
| FAIL_COUNT=0 | ||
|
|
||
| LAST_STATE="unknown" | ||
| while [ "$ELAPSED" -lt "$TIMEOUT" ]; do | ||
| if ping -I "$MIDPLANE_IFACE" -c 1 -W 1 "$HOST_IP" > /dev/null 2>&1; then | ||
| # Ping succeeded - reset failure counter | ||
| if [ "$FAIL_COUNT" -gt 0 ]; then | ||
| echo "Ping to $HOST_IP recovered after $FAIL_COUNT failures" | tee /dev/kmsg /dev/console | ||
| FAIL_COUNT=0 | ||
| fi | ||
| # Log success only when transitioning to reachable state | ||
| if [ "$LAST_STATE" != "up" ]; then | ||
| echo "Ping to $HOST_IP successful, host is still reachable" | tee /dev/kmsg /dev/console | ||
| LAST_STATE="up" | ||
| fi | ||
| else | ||
| # Ping failed - increment failure counter | ||
| FAIL_COUNT=$((FAIL_COUNT + 1)) | ||
| echo "Ping to $HOST_IP failed ($FAIL_COUNT/$MAX_FAILURES)" | tee /dev/kmsg /dev/console | ||
|
|
||
| # If we exhausted all retries (3 consecutive failures), trigger power cycle | ||
| if [ "$FAIL_COUNT" -ge "$MAX_FAILURES" ]; then | ||
| echo "Ping failure $FAIL_COUNT/$MAX_FAILURES, all retries exhausted" | tee /dev/kmsg /dev/console | ||
| echo "Triggering cpld power cycle after $MAX_FAILURES consecutive ping failures" | tee /dev/kmsg /dev/console | ||
| docker exec "$DPU_CONTAINER" /nic/bin/cpldapp -pwrcycle | ||
| exit 0 | ||
|
Comment on lines
+94
to
+99
|
||
| fi | ||
|
|
||
| # Wait before next retry | ||
| echo "Waiting $RETRY_WAIT seconds before retry..." | tee /dev/kmsg /dev/console | ||
| sleep "$RETRY_WAIT" | ||
| ELAPSED=$((ELAPSED + RETRY_WAIT)) | ||
| continue | ||
| fi | ||
| sleep "$POLL_INTERVAL" | ||
| ELAPSED=$((ELAPSED + POLL_INTERVAL)) | ||
| done | ||
| # The window can close with failures still pending; report success only when the most recent ping succeeded. | ||
| if [ "$FAIL_COUNT" -eq 0 ]; then | ||
| echo "Ping to $HOST_IP remained successful for $TIMEOUT seconds, no power cycle triggered" | tee /dev/kmsg /dev/console | ||
| else | ||
| echo "Monitoring window of $TIMEOUT seconds ended with $FAIL_COUNT/$MAX_FAILURES consecutive ping failures, no power cycle triggered" | tee /dev/kmsg /dev/console | ||
| fi | ||
| ' pre-shutdown-ping-monitor "$DPU_CONTAINER_NAME" "$HOST_IP" "$MIDPLANE_IFACE" </dev/null >/dev/null 2>&1 & | ||
|
|
||
| LOG_MSG "Platform specific pre-shutdown is successful" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The script logs that it is "skipping CPLD operations" when the DPU container name file is missing/empty, but then exits with status 1. That makes this path look like a hard failure rather than a best-effort skip and can break the reboot flow on systems where /host/dpu-docker-info/name is not populated. Consider returning success (exit 0) or continuing without CPLD operations instead of exiting non-zero here.