sonic-net · yxieca · Apr 3, 2026 · Mar 17, 2026 · Mar 18, 2026 · Mar 31, 2026
diff --git a/device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook b/device/pensando/arm64-elba-asic-flash128-r0/pre_reboot_hook
@@ -26,7 +26,90 @@ LOG_MSG "Pre-reboot flush and cache clear complete."
 
 sleep 2
 LOG_MSG "Writing pre-shutdown success to CPLD reg"
-docker exec "$(cat /host/dpu-docker-info/name)" /nic/bin/cpldapp -w 0xd 200
+
+# cpldapp runs inside the DPU container; the container name is read from this file.
+DPU_NAME_FILE="/host/dpu-docker-info/name"
+if [ ! -s "$DPU_NAME_FILE" ]; then
+    LOG_MSG "Error: DPU container name file '$DPU_NAME_FILE' is missing or empty; aborting pre-reboot hook"
+    exit 1
+fi
+DPU_CONTAINER_NAME=$(cat "$DPU_NAME_FILE")
+docker exec "$DPU_CONTAINER_NAME" /nic/bin/cpldapp -w 0xd 200
+
+# Read the midplane interface name ([Match] Name=) and the host IP (Address=) from the systemd network config
+MIDPLANE_NETWORK_FILE="/usr/lib/systemd/network/bridge-midplane.network"
+if [ ! -f "$MIDPLANE_NETWORK_FILE" ]; then
+    LOG_MSG "ERROR: Midplane network config file '$MIDPLANE_NETWORK_FILE' not found, aborting pre-reboot hook"
+    exit 1
+fi
+
+MIDPLANE_IFACE=$(awk -F= '/^\[Match\]/,/^\[Network\]/ { if ($1 == "Name") {print $2; exit} }' "$MIDPLANE_NETWORK_FILE")
+if [ -z "$MIDPLANE_IFACE" ]; then
+    LOG_MSG "ERROR: Failed to extract interface name from '$MIDPLANE_NETWORK_FILE', aborting pre-reboot hook"
+    exit 1
+fi
+LOG_MSG "Using midplane interface: $MIDPLANE_IFACE"
+
+# Host IP: first Address= entry only, prefix length stripped (several entries must not yield a multi-line value)
+HOST_IP=$(awk -F= '$1 == "Address" { split($2, parts, "/"); print parts[1]; exit }' "$MIDPLANE_NETWORK_FILE")
+if [ -z "$HOST_IP" ]; then
+    LOG_MSG "ERROR: Failed to extract host IP from '$MIDPLANE_NETWORK_FILE', aborting pre-reboot hook"
+    exit 1
+fi
+LOG_MSG "Using host IP from bridge-midplane config: $HOST_IP"
+
+# Spawn a fully detached background watchdog (setsid + nohup) that pings the host over the midplane interface.
+# Algorithm: poll every 5 seconds for up to 120 seconds; if a ping fails, wait 10 seconds and retry.
+# After 3 consecutive failures, trigger a CPLD power cycle through the DPU container and stop.
+# The script text is single-quoted and receives its inputs as positional arguments, so file-derived values are never parsed as shell code.
+setsid nohup bash -c '
+    DPU_CONTAINER="$1"
+    HOST_IP="$2"
+    MIDPLANE_IFACE="$3"
+    TIMEOUT=120
+    POLL_INTERVAL=5
+    ELAPSED=0
+    MAX_FAILURES=3
+    RETRY_WAIT=10
+    FAIL_COUNT=0
+
+    LAST_STATE="unknown"
+    while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
+        if ping -I "$MIDPLANE_IFACE" -c 1 -W 1 "$HOST_IP" > /dev/null 2>&1; then
+            # Ping succeeded - reset failure counter
+            if [ "$FAIL_COUNT" -gt 0 ]; then
+                echo "Ping to $HOST_IP recovered after $FAIL_COUNT failures" | tee /dev/kmsg /dev/console
+                FAIL_COUNT=0
+            fi
+            # Log success only when transitioning to reachable state
+            if [ "$LAST_STATE" != "up" ]; then
+                echo "Ping to $HOST_IP successful, host is still reachable" | tee /dev/kmsg /dev/console
+                LAST_STATE="up"
+            fi
+        else
+            # Ping failed - increment failure counter
+            FAIL_COUNT=$((FAIL_COUNT + 1))
+            echo "Ping to $HOST_IP failed ($FAIL_COUNT/$MAX_FAILURES)" | tee /dev/kmsg /dev/console
+
+            # MAX_FAILURES consecutive failures reached - trigger power cycle and stop
+            if [ "$FAIL_COUNT" -ge "$MAX_FAILURES" ]; then
+                echo "Ping failure $FAIL_COUNT/$MAX_FAILURES, all retries exhausted" | tee /dev/kmsg /dev/console
+                echo "Triggering cpld power cycle after $MAX_FAILURES consecutive ping failures" | tee /dev/kmsg /dev/console
+                docker exec "$DPU_CONTAINER" /nic/bin/cpldapp -pwrcycle
+                exit 0
+            fi
+            # Wait before next retry; the wait counts toward the monitoring window
+            echo "Waiting $RETRY_WAIT seconds before retry..." | tee /dev/kmsg /dev/console
+            sleep "$RETRY_WAIT"
+            ELAPSED=$((ELAPSED + RETRY_WAIT))
+            continue
+        fi
+        sleep "$POLL_INTERVAL"
+        ELAPSED=$((ELAPSED + POLL_INTERVAL))
+    done
+    # Reached when the window ends without MAX_FAILURES consecutive failures
+    echo "Ping to $HOST_IP remained successful for $TIMEOUT seconds, no power cycle triggered" | tee /dev/kmsg /dev/console
+' midplane-watchdog "$DPU_CONTAINER_NAME" "$HOST_IP" "$MIDPLANE_IFACE" </dev/null >/dev/null 2>&1 &
 
 LOG_MSG "Platform specific pre-shutdown is successful"
 
@@ -74,6 +74,12 @@ function start_polaris()
     mkdir -p $DPU_DOCKER_INFO_DIR
     echo 256 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 
+    if [ -w /sys/firmware/pensando/reboot/panic_reboot ]; then
+        echo 1 > /sys/firmware/pensando/reboot/panic_reboot
+    else
+        log_msg "Pensando panic_reboot sysfs node not writable; skipping configuration"
+    fi
+
     sync; sync;
     sleep 3;