From 834fe82a4302e7ec27c9230bb4caf9a352db56d4 Mon Sep 17 00:00:00 2001
From: root <root@slurm-sa-mi325-controller-02.ord.vultr.cpe.ice.amd.com>
Date: Tue, 31 Mar 2026 17:05:53 +0000
Subject: [PATCH 01/41] Add MI325X DeepSeek-R1 FP8 disaggregated inference
 (1P1D, Broadcom Thor 2 IBGDA)

Port the MI355X disagg recipe to MI325X (gfx942/CDNA3) on a Vultr Slurm cluster
with Broadcom BCM5760X Thor 2 NICs using IBGDA for GPU-Direct RDMA via MoRI.

Container image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt
Built from akao-amd/sglang rocm.Dockerfile with:
  - GPU_ARCH=gfx942, ENABLE_MORI=1, NIC_BACKEND=ibgda
  - Broadcom bnxt_rocelib (bcm5760x_231.2.63.0a) for RDMA userspace
  - MoRI pinned to HEAD (c0eccaf2) for bundled bnxt headers + dlopen
  - smg-wasm pinned to =1.0.0 (v1.0.1 breaks sgl-model-gateway v0.5.9 API)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 149 +++++++++++++
 .github/configs/runners.yaml                  |   5 +
 benchmarks/multi_node/amd_utils/env.sh        |   3 +
 benchmarks/multi_node/amd_utils/job.slurm     |  20 +-
 benchmarks/multi_node/amd_utils/server.sh     |   3 +-
 .../dsr1_fp8_mi325x_sglang-disagg.sh          |  82 +++++++
 runners/launch_mi325x-amd.sh                  | 206 +++++++++++++++---
 scripts/manual-test-mi325x.sh                 |  37 ++++
 8 files changed, 467 insertions(+), 38 deletions(-)
 create mode 100755 benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh
 create mode 100755 scripts/manual-test-mi325x.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e84fc0da5..6da2a4e22 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1231,3 +1231,152 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+
+dsr1-fp8-mi325x-sglang-disagg:
+  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    - spec-decoding: "none"
+      conc-list: [ 768, 512, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024 ]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index 1251e459d..f61e81e36 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -75,6 +75,11 @@ mi325x:
 - 'mi325x-amd_1'
 - 'mi325x-amd_2'
 - 'mi325x-amd_3'
+mi325x-disagg:
+- 'mi325x-amd_0'
+- 'mi325x-amd_1'
+- 'mi325x-amd_2'
+- 'mi325x-amd_3'
 mi355x:
 - 'mi355x-amds_0'
 - 'mi355x-amds_1'
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5565c5b3b..56572dfcf 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then
         export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
     elif [[ $NODENAME == mia1* ]]; then
         export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    elif [[ $NODENAME == chi-mi325x* ]]; then
+        # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
+        export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
     else
         echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
         exit 1
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 6b0352f24..0e8f465f5 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then
     exit 1
 fi
 
-# Validate MODEL_NAME exists as a top-level key in models.yaml
-if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
-    echo "Error: Model '$MODEL_NAME' not found in models.yaml"
+# MODEL_YAML_KEY is the models.yaml lookup key (bare model name, e.g. DeepSeek-R1-0528).
+# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>).
+_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
+
+# Validate the yaml key exists as a top-level key in models.yaml
+if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then
+    echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml"
     echo "Available models:"
     grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/  - /'
     exit 1
 fi
-echo "Model found: $MODEL_NAME"
+echo "Model found: $_MODEL_YAML_KEY"
 
 # All models use server.sh as the entrypoint
 RUN_FILE="server.sh"
@@ -249,10 +253,9 @@ echo "NNODES is ${NNODES}"
 echo "REPO Directory is ${DI_REPO_DIR}"
 echo "USER_NAME is ${USER_NAME}"
 
-# Get the RDMA priority and DSCP value from the NIC
+# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully)
 if ! command -v nicctl >/dev/null 2>&1; then
-    echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
-    exit 1
+    echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2
 fi
 
 # Reduce log spam
@@ -357,7 +360,7 @@ exec sudo docker run --rm \
     --privileged \
     -v ${MODEL_DIR}:/models \
     -v \$HOME/.ssh:/root/.ssh \
-    -v $(which nicctl):/usr/sbin/nicctl \
+    $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \
     --shm-size 128G \
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
@@ -373,6 +376,7 @@ exec sudo docker run --rm \
     -e xP=\$xP \
     -e yD=\$yD \
     -e MODEL_NAME=\$MODEL_NAME \
+    -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \
     -e IPADDRS=\$IPADDRS \
     -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
     -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 7f174b760..b477790b3 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -72,11 +72,12 @@ fi
 # Load model config via inline Python (PyYAML is available in SGLang containers)
 # Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
 # is done here in Python to avoid bash glob-expanding the * characters.
+_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
 eval "$(python3 -c "
 import yaml, sys, os
 
 config_path = '${MODELS_YAML}'
-model_name = '${MODEL_NAME}'
+model_name = '${_MODEL_YAML_KEY}'
 
 with open(config_path) as f:
     models = yaml.safe_load(f)
diff --git a/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh
new file mode 100755
index 000000000..6a7314ab4
--- /dev/null
+++ b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 67f93a309..4e76c205a 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -4,37 +4,185 @@ export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
 export PORT=8888
 
 PARTITION="compute"
-SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-LOCK_FILE="${SQUASH_FILE}.lock"
 
-set -x
-
-JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
-
-if [ -z "$JOB_ID" ]; then
-    echo "ERROR: salloc failed to allocate a job"
+# Detect benchmark subdir from where the script lives
+SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh"
+if [[ -f "benchmarks/multi_node/${SCRIPT_NAME}" ]]; then
+    BENCHMARK_SUBDIR="multi_node"
+elif [[ -f "benchmarks/single_node/${SCRIPT_NAME}" ]]; then
+    BENCHMARK_SUBDIR="single_node"
+else
+    echo "ERROR: ${SCRIPT_NAME} not found in benchmarks/multi_node or benchmarks/single_node"
     exit 1
 fi
 
-# Use flock to serialize concurrent imports to the same squash file
-srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
-    exec 9>\"$LOCK_FILE\"
-    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
-    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
-        echo 'Squash file already exists and is valid, skipping import'
-    else
-        rm -f \"$SQUASH_FILE\"
-        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+# =============================================================================
+# Multi-node disaggregated path: sbatch + Docker via submit.sh
+# =============================================================================
+if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then
+
+    scancel_sync() {
+        local jobid=$1
+        local timeout=${2:-600}
+        local interval=10
+        local start
+        start=$(date +%s)
+
+        echo "[scancel_sync] Requesting cancel of job $jobid"
+        scancel "$jobid" || true
+
+        while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do
+            local now
+            now=$(date +%s)
+            if (( now - start >= timeout )); then
+                echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s"
+                return 1
+            fi
+            echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..."
+            sleep "$interval"
+        done
+        echo "[scancel_sync] job $jobid exited"
+        return 0
+    }
+
+    set -x
+
+    export SLURM_ACCOUNT="$USER"
+    export SLURM_PARTITION="$PARTITION"
+    export SLURM_JOB_NAME="benchmark-sglang-disagg.job"
+
+    export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}"
+
+    # MODEL_YAML_KEY: top-level key in models.yaml for server config lookup.
+    if [[ -z "${MODEL_YAML_KEY:-}" ]]; then
+        export MODEL_YAML_KEY="${MODEL##*/}"
+    fi
+
+    # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container.
+    # Auto-resolved from the HF hub cache layout (models--<org>--<repo>/snapshots/<hash>).
+    if [[ -z "${MODEL_NAME:-}" ]]; then
+        _HF_DIR="models--${MODEL//\//--}"  # every '/' -> '--' (tr can only map one char to one char)
+        _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)
+        if [[ -n "${_SNAPSHOT}" ]]; then
+            export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"
+        else
+            export MODEL_NAME="${MODEL_YAML_KEY}"
+        fi
     fi
-"
-srun --jobid=$JOB_ID \
---container-image=$SQUASH_FILE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
---container-mount-home \
---container-writable \
---container-remap-root \
---container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL \
-bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh
-
-scancel $JOB_ID
+
+    export GPUS_PER_NODE=8
+
+    export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+    mkdir -p "$BENCHMARK_LOGS_DIR"
+    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+
+    JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}")
+
+    LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out"
+
+    sleep 10
+
+    while ! ls "$LOG_FILE" &>/dev/null; do
+        if ! squeue -u "$USER" --noheader --format='%i' | grep -qw "$JOB_ID"; then
+            echo "ERROR: Job $JOB_ID failed before creating log file"
+            scontrol show job "$JOB_ID"
+            exit 1
+        fi
+        sleep 5
+    done
+
+    set +x
+
+    (
+        while squeue -u "$USER" --noheader --format='%i' | grep -qw "$JOB_ID"; do
+            sleep 10
+        done
+    ) &
+    POLL_PID=$!
+
+    tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+    wait $POLL_PID
+
+    set -x
+
+    cat > collect_latest_results.py <<'PY'
+import os, sys
+sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
+for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
+    print(path)
+PY
+
+    LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
+    if [ -z "$LOGS_DIR" ]; then
+        echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
+        exit 1
+    fi
+
+    echo "Found logs directory: $LOGS_DIR"
+    ls -la "$LOGS_DIR"
+
+    while IFS= read -r -d '' result_file; do
+        file_name=$(basename "$result_file")
+        if [ -f "$result_file" ]; then
+            WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
+            echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
+            cp "$result_file" "$WORKSPACE_RESULT_FILE"
+        fi
+    done < <(find "$LOGS_DIR" -type f -print0)
+
+    echo "All result files processed"
+    set +x
+    scancel_sync $JOB_ID
+    set -x
+    echo "Canceled the slurm job $JOB_ID"
+
+    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+
+    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+        ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts"
+        mkdir -p "$ARTIFACT_DIR"
+        cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true
+        echo "Logs copied to $ARTIFACT_DIR for artifact upload"
+    fi
+
+# =============================================================================
+# Single-node path: enroot via salloc + srun
+# =============================================================================
+else
+
+    SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    LOCK_FILE="${SQUASH_FILE}.lock"
+
+    set -x
+
+    JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+    if [ -z "$JOB_ID" ]; then
+        echo "ERROR: salloc failed to allocate a job"
+        exit 1
+    fi
+
+    srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
+    srun --jobid=$JOB_ID \
+    --container-image=$SQUASH_FILE \
+    --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+    --container-mount-home \
+    --container-writable \
+    --container-remap-root \
+    --container-workdir=/workspace/ \
+    --no-container-entrypoint --export=ALL \
+    bash benchmarks/single_node/${SCRIPT_NAME}
+
+    scancel $JOB_ID
+
+fi
diff --git a/scripts/manual-test-mi325x.sh b/scripts/manual-test-mi325x.sh
new file mode 100755
index 000000000..c232ded2a
--- /dev/null
+++ b/scripts/manual-test-mi325x.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+export GITHUB_WORKSPACE=$(pwd)
+export RUNNER_NAME=mi325x-amd-manual
+
+export MODEL=deepseek-ai/DeepSeek-R1-0528
+export EXP_NAME=dsr1_1k1k
+export PRECISION=fp8
+export FRAMEWORK=sglang-disagg
+
+export IMAGE=ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
+
+export ISL=1024
+export OSL=1024
+export CONC_LIST="4 2 1"
+export SPEC_DECODING=none
+export RANDOM_RANGE_RATIO=1
+
+export PREFILL_NODES=1
+export PREFILL_NUM_WORKERS=1
+export PREFILL_TP=4
+export PREFILL_EP=1
+export PREFILL_DP_ATTN=false
+
+export DECODE_NODES=1
+export DECODE_NUM_WORKERS=1
+export DECODE_TP=8
+export DECODE_EP=1
+export DECODE_DP_ATTN=false
+
+bash runners/launch_mi325x-amd.sh
+
+#model files are here:
+#/nfsdata/sa/gharunner/gharunners/hf-hub-cache/models--deepseek-ai--DeepSeek-R1-0528
\ No newline at end of file

From 7b5047673f6e33a310754b85175f665fd9d5f08f Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan.nanos@gmail.com>
Date: Tue, 31 Mar 2026 10:36:30 -0700
Subject: [PATCH 02/41] Update amd-master.yaml

---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 6da2a4e22..815023c55 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1233,7 +1233,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
 
 dsr1-fp8-mi325x-sglang-disagg:
-  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt
+  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi325x-disagg

From b40908ca814db79aea0248c641b0ee359e09c762 Mon Sep 17 00:00:00 2001
From: root <root@slurm-sa-mi325-controller-02.ord.vultr.cpe.ice.amd.com>
Date: Tue, 31 Mar 2026 17:50:20 +0000
Subject: [PATCH 03/41] Add MTP config, expand sweep to full pareto frontier,
 use -good image

- Add dsr1-fp8-mi325x-sglang-disagg-mtp config with MTP=1/2 across
  all curve points (top/middle/bottom/low-conc) for both 1k/1k and 8k/1k
- Expand concurrency lists to cover full pareto frontier including
  non-optimal points
- Update image tag to v0.5.9-bnxt-good (the pushed image)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 152 ++++++++++++++++++++++++++++++++
 scripts/manual-test-mi325x.sh   |   2 +-
 2 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 815023c55..00b6a26de 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1380,3 +1380,155 @@ dsr1-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp8-mi325x-sglang-disagg-mtp:
+  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 768, 512, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=2"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=2"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024 ]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=2"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=2"
diff --git a/scripts/manual-test-mi325x.sh b/scripts/manual-test-mi325x.sh
index c232ded2a..30ec87d6a 100755
--- a/scripts/manual-test-mi325x.sh
+++ b/scripts/manual-test-mi325x.sh
@@ -15,7 +15,7 @@ export IMAGE=ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
 
 export ISL=1024
 export OSL=1024
-export CONC_LIST="4 2 1"
+export CONC_LIST="1024 512 256 128 64 32 16 8 4 2 1"
 export SPEC_DECODING=none
 export RANDOM_RANGE_RATIO=1
 

From 2421ca580cbb54491cd0bd12666ca1f660300908 Mon Sep 17 00:00:00 2001
From: root <root@slurm-sa-mi325-controller-02.ord.vultr.cpe.ice.amd.com>
Date: Tue, 31 Mar 2026 17:56:58 +0000
Subject: [PATCH 04/41] Add perf-changelog entry for MI325X disagg configs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 967edc19c..8e8ebc989 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,14 @@
+- config-keys:
+    - dsr1-fp8-mi325x-sglang-disagg
+    - dsr1-fp8-mi325x-sglang-disagg-mtp
+  description:
+    - "Add MI325X DeepSeek-R1 FP8 disaggregated inference with Broadcom Thor 2 IBGDA"
+    - "Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches"
+    - "Image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good"
+    - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k"
+    - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985
+
 - config-keys:
     - kimik2.5-int4-mi300x-vllm
   description:

From 6abdf85570d876220480a317ba9635451ccb055f Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Wed, 1 Apr 2026 00:19:24 +0000
Subject: [PATCH 05/41] Fix MI325X QoS detection and NFS-safe cleanup for
 disagg benchmarks

- Add chi-mi325x* hostname detection in env.sh for RDMA QoS config
  (MORI_RDMA_TC=104, MORI_RDMA_SL=3, derived from DCB DSCP AF31->prio 3)
  since nicctl is not available on Vultr/CPE MI325X hosts
- Wrap sudo rm -rf calls with timeout 30s in launch_mi325x-amd.sh and
  job.slurm to prevent indefinite hangs on stale NFS locks

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/env.sh    | 10 ++++++++++
 benchmarks/multi_node/amd_utils/job.slurm |  4 ++--
 runners/launch_mi325x-amd.sh              |  6 ++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 56572dfcf..99f2d0238 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -104,6 +104,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
         elif [[ $NODENAME == mia1* ]]; then
             export MORI_RDMA_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+        elif [[ $NODENAME == chi-mi325x* ]]; then
+            # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
+            export MORI_RDMA_TC=104
+            export MORI_RDMA_SL=3
+            echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
         else
             echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
         fi
@@ -117,6 +122,11 @@ else
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+    elif [[ $NODENAME == chi-mi325x* ]]; then
+        # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
+        export MORI_RDMA_TC=104
+        export MORI_RDMA_SL=3
+        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
     else
         echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
         echo "       This is normal for clusters without QoS or outside Docker containers."
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 0e8f465f5..784161d06 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -299,8 +299,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)
 
 cleanup() {
   echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
-  # clean up the logs folder
-  sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
+  # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks
+  timeout --kill-after=5 30 sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
 
   echo "[${SLURM_JOB_ID}] cleanup done."
 }
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 4e76c205a..a21d2fd58 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -74,7 +74,8 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then
 
     export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
     mkdir -p "$BENCHMARK_LOGS_DIR"
-    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+    # NFS-safe cleanup: timeout runs under sudo so it can signal the root-owned rm on a stale NFS hang
+    sudo timeout --kill-after=5 30 rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
     JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}")
 
@@ -137,7 +138,8 @@ PY
     set -x
     echo "Canceled the slurm job $JOB_ID"
 
-    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+    # NFS-safe cleanup: timeout runs under sudo so it can signal the root-owned rm on a stale NFS hang
+    sudo timeout --kill-after=5 30 rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
 
     if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
         ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts"

From 37162588f5776b1e4282577f801da8fd93d7448c Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Wed, 1 Apr 2026 06:44:31 +0000
Subject: [PATCH 06/41] Add local NVMe model caching for faster model loading

Pre-stage model weights from NFS/shared storage to local NVMe before
the inference server starts. Reduces model load time for large models
(e.g., DeepSeek-R1 ~340GB FP8) from NFS read speeds to NVMe speeds.

- utils/setup_local_nvme.sh: One-time NVMe setup script for compute nodes
  (format, mount, fstab entry). Supports single drive or RAID-0.
- utils/cache_model_locally.sh: Standalone/sourceable model caching utility
  using rsync with parallel blob sync for HF hub cache layout.
- job.slurm: When LOCAL_MODEL_CACHE_DIR is set, runs srun-based parallel
  rsync on all nodes before Docker starts. Idempotent (skips if cached).
  Falls back to shared storage if caching fails.
- launch_mi325x-amd.sh: Enable local caching at /local-nvme/models for
  MI325X cluster (8x 3.5TB NVMe per node).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm |  61 +++++++++++
 runners/launch_mi325x-amd.sh              |   5 +
 utils/cache_model_locally.sh              | 109 ++++++++++++++++++++
 utils/setup_local_nvme.sh                 | 118 ++++++++++++++++++++++
 4 files changed, 293 insertions(+)
 create mode 100755 utils/cache_model_locally.sh
 create mode 100755 utils/setup_local_nvme.sh

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 784161d06..7c746b41a 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -321,6 +321,67 @@ srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
     echo "NFS cache refreshed on $(hostname)"
 '
 
+# =============================================================================
+# Optional: Pre-stage model to local NVMe for faster loading
+# =============================================================================
+# LOCAL_MODEL_CACHE_DIR: mount point for fast local storage (NVMe/SSD) on compute nodes.
+# Set per-cluster via the runner/launch script. When set, model weights are rsync'd
+# from shared storage to local NVMe before Docker starts. This is idempotent —
+# subsequent runs skip files already cached locally.
+#
+# If unset, or if pre-staging fails on any node, the model is served directly
+# from shared storage (NFS/Lustre) as before.
+if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then
+    LOCAL_MODEL_FULL="${LOCAL_MODEL_CACHE_DIR}/${MODEL_NAME}"
+    echo "[cache] Pre-staging model to local NVMe on all nodes..."
+    echo "[cache]   Source: $MODEL_PATH"
+    echo "[cache]   Dest:   $LOCAL_MODEL_FULL"
+
+    srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
+        set -euo pipefail
+        SRC="'"$MODEL_PATH"'"
+        DST="'"$LOCAL_MODEL_FULL"'"
+        CACHE_DIR="'"${LOCAL_MODEL_CACHE_DIR}"'"
+
+        # Create destination directory
+        sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR"
+        sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true
+
+        SRC_COUNT=$(find "$SRC" -type f 2>/dev/null | wc -l)
+        DST_COUNT=$(find "$DST" -type f 2>/dev/null | wc -l)
+
+        if [[ "$SRC_COUNT" -eq "$DST_COUNT" ]] && [[ "$DST_COUNT" -gt 0 ]]; then
+            echo "[cache] $(hostname): Already cached ($DST_COUNT files)"
+        else
+            echo "[cache] $(hostname): Syncing $SRC_COUNT files..."
+            START=$(date +%s)
+
+            if [[ -d "$SRC/blobs" ]]; then
+                # HuggingFace hub cache layout: parallel-sync large blobs
+                mkdir -p "$DST/blobs"
+                find "$SRC/blobs" -type f -printf "%f\n" | \
+                    xargs -P '"${CACHE_PARALLEL_JOBS:-4}"' -I{} \
+                    rsync -a --whole-file --ignore-existing "$SRC/blobs/{}" "$DST/blobs/{}"
+                rsync -a --whole-file --ignore-existing --exclude="blobs/" "$SRC/" "$DST/"
+            else
+                # Flat model directory
+                rsync -a --whole-file --ignore-existing "$SRC/" "$DST/"
+            fi
+
+            ELAPSED=$(( $(date +%s) - START ))
+            SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1)
+            echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)"
+        fi
+    ' 2>&1
+
+    if [[ $? -eq 0 ]]; then
+        echo "[cache] Model pre-staged successfully. Updating MODEL_DIR."
+        MODEL_DIR="${LOCAL_MODEL_CACHE_DIR}"
+    else
+        echo "[cache] WARNING: Local caching failed on some nodes. Falling back to shared storage."
+    fi
+fi
+
 srun \
   --nodelist="$SELECTED_NODELIST_SRUN" \
   --kill-on-bad-exit=1 \
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index a21d2fd58..107c68d7d 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -3,6 +3,11 @@
 export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
 export PORT=8888
 
+# Local NVMe cache for model weights. Export LOCAL_MODEL_CACHE_DIR="" to disable:
+# the default below applies only when the variable is unset. MI325X nodes have
+# 8x 3.5TB NVMe drives; set up via: sudo bash utils/setup_local_nvme.sh /local-nvme
+export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR-/local-nvme/models}"
+
 PARTITION="compute"
 
 # Detect benchmark subdir from where the script lives
diff --git a/utils/cache_model_locally.sh b/utils/cache_model_locally.sh
new file mode 100755
index 000000000..37369d29e
--- /dev/null
+++ b/utils/cache_model_locally.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+# cache_model_locally.sh — Pre-stage model weights from shared storage to local NVMe.
+#
+# Syncs a model directory from NFS/shared storage to fast local NVMe before
+# the inference server starts, dramatically reducing model load time.
+#
+# Usage:
+#   source utils/cache_model_locally.sh
+#   cache_model_locally "/nfs/models/deepseek-r1" "/local-nvme/models/deepseek-r1"
+#
+# Or as a standalone script:
+#   bash utils/cache_model_locally.sh /nfs/models/deepseek-r1 /local-nvme/models/deepseek-r1
+#
+# Features:
+#   - Idempotent: skips files already present on the target
+#   - Preserves HuggingFace cache symlink structure
+#   - Concurrent execution safe (multiple nodes can cache simultaneously)
+#   - Configurable timeout to prevent NFS hangs
+#   - Works with both HF hub cache layout and flat model directories
+#
+# Environment variables:
+#   CACHE_PARALLEL_JOBS  — number of parallel rsync jobs for large blobs (default: 4)
+#   CACHE_TIMEOUT        — per-file timeout in seconds (default: 600)
+#   CACHE_DRY_RUN        — set to 1 to print what would be synced without copying
+
+set -euo pipefail
+
+CACHE_PARALLEL_JOBS="${CACHE_PARALLEL_JOBS:-4}"
+CACHE_TIMEOUT="${CACHE_TIMEOUT:-600}"
+CACHE_DRY_RUN="${CACHE_DRY_RUN:-0}"
+
+cache_model_locally() {
+    local src="${1:?Usage: cache_model_locally <source_path> <dest_path>}"
+    local dst="${2:?Usage: cache_model_locally <source_path> <dest_path>}"
+
+    if [[ ! -d "$src" ]]; then
+        echo "[cache] ERROR: Source path does not exist: $src" >&2
+        return 1
+    fi
+
+    # Quick check: if dest has the same number of regular files, skip entirely
+    local src_count dst_count
+    src_count=$(find "$src" -type f 2>/dev/null | wc -l)
+    dst_count=$(find "$dst" -type f 2>/dev/null | wc -l)
+
+    if [[ "$src_count" -eq "$dst_count" ]] && [[ "$dst_count" -gt 0 ]]; then
+        echo "[cache] Already cached: $dst ($dst_count files)"
+        echo "$dst"
+        return 0
+    fi
+
+    echo "[cache] Syncing model to local storage..."
+    echo "[cache]   Source: $src"
+    echo "[cache]   Dest:   $dst"
+    echo "[cache]   Parallel jobs: $CACHE_PARALLEL_JOBS"
+
+    mkdir -p "$dst"
+
+    local rsync_opts=(-a --whole-file --ignore-existing --info=name)
+    if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then
+        rsync_opts+=(--dry-run)
+    fi
+
+    local start_time
+    start_time=$(date +%s)
+
+    # Check if this is a HuggingFace hub cache directory (has blobs/ subdir)
+    if [[ -d "$src/blobs" ]]; then
+        echo "[cache] Detected HuggingFace hub cache layout"
+
+        # Step 1: Parallel-sync the large blob files (the actual model weights)
+        mkdir -p "$dst/blobs"
+        find "$src/blobs" -type f -printf '%f\n' | \
+            xargs -P "$CACHE_PARALLEL_JOBS" -I{} \
+            timeout "$CACHE_TIMEOUT" rsync "${rsync_opts[@]}" "$src/blobs/{}" "$dst/blobs/{}"
+
+        # Step 2: Sync everything else (symlinks in snapshots/, refs/, etc.) — fast
+        rsync "${rsync_opts[@]}" --exclude='blobs/' "$src/" "$dst/"
+    else
+        # Flat model directory: parallel-sync large files, then the rest
+        echo "[cache] Detected flat model directory"
+
+        # Sync large files (>100MB) in parallel
+        find "$src" -type f -size +100M -printf '%P\n' | \
+            xargs -P "$CACHE_PARALLEL_JOBS" -I{} bash -c \
+            'mkdir -p "$(dirname "'"$dst"'/{}")"; timeout '"$CACHE_TIMEOUT"' rsync '"$(printf '%q ' "${rsync_opts[@]}")"' "'"$src"'/{}" "'"$dst"'/{}"'
+
+        # Sync remaining small files and symlinks
+        rsync "${rsync_opts[@]}" "$src/" "$dst/"
+    fi
+
+    local elapsed=$(( $(date +%s) - start_time ))
+    local size
+    size=$(du -sh "$dst" 2>/dev/null | cut -f1)
+
+    echo "[cache] Done in ${elapsed}s — $size cached at $dst"
+    echo "$dst"
+    return 0
+}
+
+# If run as a standalone script (not sourced), execute with args
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+    if [[ $# -lt 2 ]]; then
+        echo "Usage: $0 <source_model_path> <dest_model_path>" >&2
+        echo "  Env: CACHE_PARALLEL_JOBS=$CACHE_PARALLEL_JOBS CACHE_TIMEOUT=$CACHE_TIMEOUT" >&2
+        exit 1
+    fi
+    cache_model_locally "$1" "$2"
+fi
diff --git a/utils/setup_local_nvme.sh b/utils/setup_local_nvme.sh
new file mode 100755
index 000000000..03b81e8a4
--- /dev/null
+++ b/utils/setup_local_nvme.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+# setup_local_nvme.sh — Format and mount local NVMe drives for model caching.
+#
+# Detects unformatted/unmounted NVMe drives and sets up a mount point for
+# caching model weights locally. Designed to be run once per node (idempotent).
+#
+# Usage (run on each compute node, requires root):
+#   sudo bash utils/setup_local_nvme.sh [mount_point]
+#
+# Default mount point: /local-nvme
+#
+# This script:
+#   1. Finds the first available NVMe drive that is not the boot device and
+#      has nothing mounted from it (whole disk, partitions, or child devices)
+#   2. Formats it with ext4 if it holds no filesystem; an existing ext4 is
+#      reused, and any other filesystem is left untouched (the script aborts)
+#   3. Mounts it at the specified mount point
+#   4. Adds an fstab entry for persistence across reboots
+#
+# For RAID-0 across multiple NVMe drives (maximum throughput), use:
+#   sudo bash utils/setup_local_nvme.sh --raid [mount_point]
+#
+# Environment variables:
+#   NVME_FORCE_FORMAT — set to 1 to overwrite a non-ext4 filesystem found on
+#                       the target device (destroys its contents). sudo resets
+#                       the environment by default, so pass it after sudo:
+#                         sudo NVME_FORCE_FORMAT=1 bash utils/setup_local_nvme.sh
+
+set -euo pipefail
+
+USE_RAID=false
+MOUNT_POINT="/local-nvme"
+NVME_FORCE_FORMAT="${NVME_FORCE_FORMAT:-0}"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --raid) USE_RAID=true; shift ;;
+        *) MOUNT_POINT="$1"; shift ;;
+    esac
+done
+
+if [[ $EUID -ne 0 ]]; then
+    echo "ERROR: This script must be run as root (sudo)" >&2
+    exit 1
+fi
+
+echo "[nvme-setup] Mount point: $MOUNT_POINT"
+
+# Already mounted? Then there is nothing to do; this makes re-runs a no-op.
+if mountpoint -q "$MOUNT_POINT" 2>/dev/null; then
+    echo "[nvme-setup] $MOUNT_POINT is already mounted:"
+    df -h "$MOUNT_POINT"
+    exit 0
+fi
+
+# Find NVMe drives that are not part of the root filesystem.
+# ROOT_DEV is the root source with its partition suffix stripped
+# (e.g. /dev/nvme0n1p2 -> /dev/nvme0n1).
+ROOT_DEV=$(findmnt -n -o SOURCE / | sed 's/[0-9]*$//' | sed 's/p$//')
+NVME_DRIVES=()
+for dev in /dev/nvme*n1; do
+    [[ -b "$dev" ]] || continue
+    # Skip if this drive is part of root. The non-empty guard keeps an empty
+    # ROOT_DEV from prefix-matching (and therefore skipping) every drive.
+    if [[ -n "$ROOT_DEV" && "$dev" == "$ROOT_DEV"* ]]; then
+        echo "[nvme-setup] Skipping $dev (root device)"
+        continue
+    fi
+    # Skip if the drive, one of its partitions, or any device stacked on it
+    # is mounted. lsblk lists the mount point of every node in the device's
+    # tree; checking only the bare device name would miss a mounted
+    # /dev/nvmeXn1pY and let mkfs below wipe a disk that is in use.
+    dev_mounts=$(lsblk -nro MOUNTPOINT "$dev" 2>/dev/null || true)
+    if [[ -n "${dev_mounts//[[:space:]]/}" ]]; then
+        echo "[nvme-setup] Skipping $dev (already mounted)"
+        continue
+    fi
+    # Skip if part of an md array
+    if grep -q "$(basename "$dev")" /proc/mdstat 2>/dev/null; then
+        echo "[nvme-setup] Skipping $dev (part of md array)"
+        continue
+    fi
+    NVME_DRIVES+=("$dev")
+done
+
+if [[ ${#NVME_DRIVES[@]} -eq 0 ]]; then
+    echo "[nvme-setup] No available NVMe drives found."
+    exit 1
+fi
+
+echo "[nvme-setup] Found ${#NVME_DRIVES[@]} available NVMe drives: ${NVME_DRIVES[*]}"
+
+if [[ "$USE_RAID" == true ]] && [[ ${#NVME_DRIVES[@]} -gt 1 ]]; then
+    # RAID-0 for maximum throughput
+    MD_DEV="/dev/md10"
+    echo "[nvme-setup] Creating RAID-0 array across ${#NVME_DRIVES[@]} drives..."
+
+    if [[ -b "$MD_DEV" ]]; then
+        echo "[nvme-setup] $MD_DEV already exists, using it"
+    else
+        mdadm --create "$MD_DEV" --level=0 --raid-devices=${#NVME_DRIVES[@]} "${NVME_DRIVES[@]}" --run
+    fi
+
+    TARGET_DEV="$MD_DEV"
+else
+    # Single drive (use the first available)
+    if [[ "$USE_RAID" == true ]]; then
+        echo "[nvme-setup] WARNING: --raid requested but only one drive is available" >&2
+    fi
+    TARGET_DEV="${NVME_DRIVES[0]}"
+    echo "[nvme-setup] Using single drive: $TARGET_DEV"
+fi
+
+# Format if needed. blkid exits non-zero when it finds no filesystem, so the
+# "|| true" keeps set -e from aborting and leaves FS_TYPE empty in that case.
+FS_TYPE=$(blkid -s TYPE -o value "$TARGET_DEV" 2>/dev/null || true)
+if [[ "$FS_TYPE" == "ext4" ]]; then
+    echo "[nvme-setup] $TARGET_DEV already has ext4 filesystem"
+elif [[ -z "$FS_TYPE" || "$NVME_FORCE_FORMAT" == "1" ]]; then
+    echo "[nvme-setup] Formatting $TARGET_DEV with ext4..."
+    mkfs.ext4 -F -L local-nvme "$TARGET_DEV"
+else
+    echo "ERROR: $TARGET_DEV already holds a $FS_TYPE filesystem; refusing to format it." >&2
+    echo "       Re-run with NVME_FORCE_FORMAT=1 to overwrite." >&2
+    exit 1
+fi
+
+# Mount
+mkdir -p "$MOUNT_POINT"
+mount -o noatime,discard "$TARGET_DEV" "$MOUNT_POINT"
+
+# Set permissions so non-root users can write
+chmod 1777 "$MOUNT_POINT"
+
+# Add fstab entry if not present. The mount-point field is compared exactly
+# so that comment lines and longer paths sharing the prefix do not count.
+if ! awk -v mp="$MOUNT_POINT" '$1 !~ /^#/ && $2 == mp { hit = 1 } END { exit !hit }' /etc/fstab 2>/dev/null; then
+    UUID=$(blkid -s UUID -o value "$TARGET_DEV")
+    echo "UUID=$UUID $MOUNT_POINT ext4 noatime,discard,nofail 0 2" >> /etc/fstab
+    echo "[nvme-setup] Added fstab entry"
+fi
+
+echo "[nvme-setup] Done:"
+df -h "$MOUNT_POINT"

From db677bd8fc2fc0a71ac57d67c35b88f6ccc06910 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Wed, 1 Apr 2026 06:54:51 +0000
Subject: [PATCH 07/41] Switch model caching from rsync to rclone sync

Use rclone sync with --transfers 32 --checkers 32 --links for
high-parallelism model pre-staging from NFS to local NVMe.
rclone is now installed on all MI325X compute nodes (v1.73.3).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm | 37 ++++--------
 utils/cache_model_locally.sh              | 71 ++++++-----------------
 2 files changed, 30 insertions(+), 78 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 7c746b41a..523bfd7c5 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -347,31 +347,18 @@ if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then
         sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR"
         sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true
 
-        SRC_COUNT=$(find "$SRC" -type f 2>/dev/null | wc -l)
-        DST_COUNT=$(find "$DST" -type f 2>/dev/null | wc -l)
-
-        if [[ "$SRC_COUNT" -eq "$DST_COUNT" ]] && [[ "$DST_COUNT" -gt 0 ]]; then
-            echo "[cache] $(hostname): Already cached ($DST_COUNT files)"
-        else
-            echo "[cache] $(hostname): Syncing $SRC_COUNT files..."
-            START=$(date +%s)
-
-            if [[ -d "$SRC/blobs" ]]; then
-                # HuggingFace hub cache layout: parallel-sync large blobs
-                mkdir -p "$DST/blobs"
-                find "$SRC/blobs" -type f -printf "%f\n" | \
-                    xargs -P '"${CACHE_PARALLEL_JOBS:-4}"' -I{} \
-                    rsync -a --whole-file --ignore-existing "$SRC/blobs/{}" "$DST/blobs/{}"
-                rsync -a --whole-file --ignore-existing --exclude="blobs/" "$SRC/" "$DST/"
-            else
-                # Flat model directory
-                rsync -a --whole-file --ignore-existing "$SRC/" "$DST/"
-            fi
-
-            ELAPSED=$(( $(date +%s) - START ))
-            SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1)
-            echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)"
-        fi
+        echo "[cache] $(hostname): Syncing model to local NVMe..."
+        START=$(date +%s)
+
+        rclone sync "$SRC/" "$DST/" \
+            --transfers 32 \
+            --checkers 32 \
+            --links \
+            --progress
+
+        ELAPSED=$(( $(date +%s) - START ))
+        SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1)
+        echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)"
     ' 2>&1
 
     if [[ $? -eq 0 ]]; then
diff --git a/utils/cache_model_locally.sh b/utils/cache_model_locally.sh
index 37369d29e..0b1480231 100755
--- a/utils/cache_model_locally.sh
+++ b/utils/cache_model_locally.sh
@@ -2,31 +2,30 @@
 # cache_model_locally.sh — Pre-stage model weights from shared storage to local NVMe.
 #
 # Syncs a model directory from NFS/shared storage to fast local NVMe before
-# the inference server starts, dramatically reducing model load time.
+# the inference server starts, using rclone for high-parallelism transfers.
 #
 # Usage:
 #   source utils/cache_model_locally.sh
-#   cache_model_locally "/nfs/models/deepseek-r1" "/local-nvme/models/deepseek-r1"
+#   cache_model_locally "/nfs/hub/models--org--repo" "/local-nvme/hub/models--org--repo"
 #
 # Or as a standalone script:
-#   bash utils/cache_model_locally.sh /nfs/models/deepseek-r1 /local-nvme/models/deepseek-r1
+#   bash utils/cache_model_locally.sh /nfs/hub/models--org--repo /local-nvme/hub/models--org--repo
 #
 # Features:
-#   - Idempotent: skips files already present on the target
-#   - Preserves HuggingFace cache symlink structure
-#   - Concurrent execution safe (multiple nodes can cache simultaneously)
-#   - Configurable timeout to prevent NFS hangs
+#   - Uses rclone sync with 32 parallel transfers for maximum throughput
+#   - Preserves HuggingFace cache symlink structure (--links)
+#   - Idempotent: rclone skips files already present and identical
 #   - Works with both HF hub cache layout and flat model directories
 #
 # Environment variables:
-#   CACHE_PARALLEL_JOBS  — number of parallel rsync jobs for large blobs (default: 4)
-#   CACHE_TIMEOUT        — per-file timeout in seconds (default: 600)
-#   CACHE_DRY_RUN        — set to 1 to print what would be synced without copying
+#   CACHE_TRANSFERS  — number of parallel rclone transfers (default: 32)
+#   CACHE_CHECKERS   — number of parallel rclone checkers (default: 32)
+#   CACHE_DRY_RUN    — set to 1 to print what would be synced without copying
 
 set -euo pipefail
 
-CACHE_PARALLEL_JOBS="${CACHE_PARALLEL_JOBS:-4}"
-CACHE_TIMEOUT="${CACHE_TIMEOUT:-600}"
+CACHE_TRANSFERS="${CACHE_TRANSFERS:-32}"
+CACHE_CHECKERS="${CACHE_CHECKERS:-32}"
 CACHE_DRY_RUN="${CACHE_DRY_RUN:-0}"
 
 cache_model_locally() {
@@ -38,57 +37,23 @@ cache_model_locally() {
         return 1
     fi
 
-    # Quick check: if dest has the same number of regular files, skip entirely
-    local src_count dst_count
-    src_count=$(find "$src" -type f 2>/dev/null | wc -l)
-    dst_count=$(find "$dst" -type f 2>/dev/null | wc -l)
-
-    if [[ "$src_count" -eq "$dst_count" ]] && [[ "$dst_count" -gt 0 ]]; then
-        echo "[cache] Already cached: $dst ($dst_count files)"
-        echo "$dst"
-        return 0
-    fi
-
     echo "[cache] Syncing model to local storage..."
     echo "[cache]   Source: $src"
     echo "[cache]   Dest:   $dst"
-    echo "[cache]   Parallel jobs: $CACHE_PARALLEL_JOBS"
+    echo "[cache]   Transfers: $CACHE_TRANSFERS, Checkers: $CACHE_CHECKERS"
 
     mkdir -p "$dst"
 
-    local rsync_opts=(-a --whole-file --ignore-existing --info=name)
-    if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then
-        rsync_opts+=(--dry-run)
-    fi
-
     local start_time
     start_time=$(date +%s)
 
-    # Check if this is a HuggingFace hub cache directory (has blobs/ subdir)
-    if [[ -d "$src/blobs" ]]; then
-        echo "[cache] Detected HuggingFace hub cache layout"
-
-        # Step 1: Parallel-sync the large blob files (the actual model weights)
-        mkdir -p "$dst/blobs"
-        find "$src/blobs" -type f -printf '%f\n' | \
-            xargs -P "$CACHE_PARALLEL_JOBS" -I{} \
-            timeout "$CACHE_TIMEOUT" rsync "${rsync_opts[@]}" "$src/blobs/{}" "$dst/blobs/{}"
-
-        # Step 2: Sync everything else (symlinks in snapshots/, refs/, etc.) — fast
-        rsync "${rsync_opts[@]}" --exclude='blobs/' "$src/" "$dst/"
-    else
-        # Flat model directory: parallel-sync large files, then the rest
-        echo "[cache] Detected flat model directory"
-
-        # Sync large files (>100MB) in parallel
-        find "$src" -type f -size +100M -printf '%P\n' | \
-            xargs -P "$CACHE_PARALLEL_JOBS" -I{} bash -c \
-            'mkdir -p "$(dirname "'"$dst"'/{}")"; timeout '"$CACHE_TIMEOUT"' rsync '"$(printf '%q ' "${rsync_opts[@]}")"' "'"$src"'/{}" "'"$dst"'/{}"'
-
-        # Sync remaining small files and symlinks
-        rsync "${rsync_opts[@]}" "$src/" "$dst/"
+    local rclone_opts=(--transfers "$CACHE_TRANSFERS" --checkers "$CACHE_CHECKERS" --links --progress)
+    if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then
+        rclone_opts+=(--dry-run)
     fi
 
+    rclone sync "$src/" "$dst/" "${rclone_opts[@]}"
+
     local elapsed=$(( $(date +%s) - start_time ))
     local size
     size=$(du -sh "$dst" 2>/dev/null | cut -f1)
@@ -102,7 +67,7 @@ cache_model_locally() {
 if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
     if [[ $# -lt 2 ]]; then
         echo "Usage: $0 <source_model_path> <dest_model_path>" >&2
-        echo "  Env: CACHE_PARALLEL_JOBS=$CACHE_PARALLEL_JOBS CACHE_TIMEOUT=$CACHE_TIMEOUT" >&2
+        echo "  Env: CACHE_TRANSFERS=$CACHE_TRANSFERS CACHE_CHECKERS=$CACHE_CHECKERS" >&2
         exit 1
     fi
     cache_model_locally "$1" "$2"

From 0a485de74d254f2dab6445b4891f69630a816872 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Wed, 1 Apr 2026 07:30:13 +0000
Subject: [PATCH 08/41] Add MTP baseline to single-node MI325X DeepSeek-R1 FP8
 config

Add spec-decoding: mtp search space entries alongside the existing
non-MTP entries for both 1k/1k and 8k/1k sequence length configs.
This provides a single-node MTP baseline for comparison with the
disaggregated multi-node MTP results.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 00b6a26de..1ab86b8af 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -89,10 +89,12 @@ dsr1-fp8-mi325x-sglang:
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 dsr1-fp8-mi355x-sglang:
   image: lmsysorg/sglang:v0.5.9-rocm700-mi35x

From 67dec7cfd088e7f8fb82afec88e1a3a21190c3b7 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Wed, 1 Apr 2026 07:36:09 +0000
Subject: [PATCH 09/41] Split MI325X single-node MTP into separate config key

Separate dsr1-fp8-mi325x-sglang-mtp from the base config so it can
be swept independently. Full sweeps still cover both via their
respective config keys.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 1ab86b8af..9fb5c53a0 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -89,11 +89,27 @@ dsr1-fp8-mi325x-sglang:
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
-    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+
+dsr1-fp8-mi325x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+  - isl: 8192
+    osl: 1024
+    search-space:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 dsr1-fp8-mi355x-sglang:

From f18257f6abba974895693a6b0dce6363234d3bbe Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 2 Apr 2026 03:39:54 +0000
Subject: [PATCH 10/41] Fix MI325X single-node script resolution and add MTP
 support

The launcher's script name pattern included _${FRAMEWORK} suffix, but
single-node scripts don't use framework suffixes (only multi-node disagg
scripts do). This broke all MI325X single-node configs. Fix by trying
framework-suffixed name for multi-node first, then falling back to the
base name for single-node.

Also add MTP speculative decoding support to the existing dsr1_fp8_mi325x.sh
script and update perf-changelog with the single-node config keys.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsr1_fp8_mi325x.sh | 13 +++++++++++--
 perf-changelog.yaml                       | 10 +++++++---
 runners/launch_mi325x-amd.sh              | 15 ++++++++++-----
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh
index ae1e930f0..54760882b 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh
@@ -26,6 +26,14 @@ hf download $MODEL
 export SGLANG_USE_AITER=1
 export SGLANG_AITER_MLA_PERSIST=1
 
+# MTP (speculative decoding) flags
+MTP_ARGS=""
+CHAT_TEMPLATE_ARGS=""
+if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then
+    MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 1 --speculative-num-draft-tokens 2"
+    CHAT_TEMPLATE_ARGS="--use-chat-template"
+fi
+
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -47,7 +55,7 @@ python3 -m sglang.launch_server \
 --kv-cache-dtype fp8_e4m3 \
 --attention-backend aiter \
 --disable-radix-cache \
-$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+$MTP_ARGS $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -64,7 +72,8 @@ run_benchmark_serving \
     --num-prompts $(( $CONC * 10 )) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir /workspace/ \
+    $CHAT_TEMPLATE_ARGS
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8e8ebc989..d059c439b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,10 +1,14 @@
 - config-keys:
+    - dsr1-fp8-mi325x-sglang
+    - dsr1-fp8-mi325x-sglang-mtp
     - dsr1-fp8-mi325x-sglang-disagg
     - dsr1-fp8-mi325x-sglang-disagg-mtp
   description:
-    - "Add MI325X DeepSeek-R1 FP8 disaggregated inference with Broadcom Thor 2 IBGDA"
-    - "Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches"
-    - "Image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good"
+    - "Add MI325X DeepSeek-R1 FP8 single-node and disaggregated inference with Broadcom Thor 2 IBGDA"
+    - "Single-node: SGLang with aiter backend, MLA persist kernel, TP8, FP8 KV cache"
+    - "Disaggregated: Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches"
+    - "Image (disagg): ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good"
+    - "Image (single-node): lmsysorg/sglang:v0.5.9-rocm700-mi30x"
     - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k"
     - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 107c68d7d..6ac64f3d8 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -10,14 +10,19 @@ export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR:-/local-nvme/models}"
 
 PARTITION="compute"
 
-# Detect benchmark subdir from where the script lives
-SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh"
-if [[ -f "benchmarks/multi_node/${SCRIPT_NAME}" ]]; then
+# Detect benchmark subdir from where the script lives.
+# Multi-node scripts include the framework suffix (e.g. _sglang-disagg.sh);
+# single-node scripts do not (e.g. dsr1_fp8_mi325x.sh).
+SCRIPT_NAME_WITH_FW="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh"
+SCRIPT_NAME_BASE="${EXP_NAME%%_*}_${PRECISION}_mi325x.sh"
+if [[ -f "benchmarks/multi_node/${SCRIPT_NAME_WITH_FW}" ]]; then
     BENCHMARK_SUBDIR="multi_node"
-elif [[ -f "benchmarks/single_node/${SCRIPT_NAME}" ]]; then
+    SCRIPT_NAME="${SCRIPT_NAME_WITH_FW}"
+elif [[ -f "benchmarks/single_node/${SCRIPT_NAME_BASE}" ]]; then
     BENCHMARK_SUBDIR="single_node"
+    SCRIPT_NAME="${SCRIPT_NAME_BASE}"
 else
-    echo "ERROR: ${SCRIPT_NAME} not found in benchmarks/multi_node or benchmarks/single_node"
+    echo "ERROR: neither benchmarks/multi_node/${SCRIPT_NAME_WITH_FW} nor benchmarks/single_node/${SCRIPT_NAME_BASE} found"
     exit 1
 fi
 

From 3ccfba3b1e0ea6146b2b760e30520df5d85332f8 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 2 Apr 2026 05:39:28 +0000
Subject: [PATCH 11/41] Fix decode dispatch token limit for DP attention disagg
 configs

SGLang's DP attention mode overrides chunked_prefill_size to 1024,
which must be <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. The
default MORI_MAX_DISPATCH_TOKENS_DECODE of 160 is too small, causing
an assertion failure on all EP8/DP decode configs (both MI325X and
MI355X). Bump to 1024 when DP attention is enabled.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index b477790b3..960cbb6e7 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -213,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
 fi
 
+# DP attention forces chunked_prefill_size to 1024 inside SGLang, which must be
+# <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. Bump the decode dispatch
+# token limit when DP is enabled to satisfy this assertion.
+if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then
+    MORI_MAX_DISPATCH_TOKENS_DECODE=1024
+fi
+
 # =============================================================================
 # Cluster Topology Configuration
 # =============================================================================

From 021303211d473fa1ac25dbe870b14429459e1796 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 2 Apr 2026 18:15:38 +0000
Subject: [PATCH 12/41] Disable EP8/DP disagg configs on MI325X and bump MTP to
 3 tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with
Broadcom Thor 2 NICs) — servers start but all requests fail.
Comment out these search-space entries for now.

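Bump DECODE_MTP_SIZE from 2 to 3 in the disagg MTP configs, and in the
single-node script raise speculative-num-steps from 1 to 3 (with
speculative-num-draft-tokens from 2 to 4) for better low-concurrency
decode throughput on CDNA3.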

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml           | 232 +++++++++++-----------
 benchmarks/single_node/dsr1_fp8_mi325x.sh |   2 +-
 2 files changed, 119 insertions(+), 115 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9fb5c53a0..68d2aec6e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1263,43 +1263,44 @@ dsr1-fp8-mi325x-sglang-disagg:
   - isl: 1024
     osl: 1024
     search-space:
-    # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 768, 512, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+    # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 768, 512, 256 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
     - spec-decoding: "none"
@@ -1342,24 +1343,25 @@ dsr1-fp8-mi325x-sglang-disagg:
   - isl: 8192
     osl: 1024
     search-space:
-    # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
+    # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "PREFILL_NODES=2"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
     - spec-decoding: "none"
@@ -1414,43 +1416,44 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
     osl: 1024
     search-space:
     # MTP configurations
-    # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 768, 512, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+    # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=1"
+
+    # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 768, 512, 256 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=1"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
     - spec-decoding: "mtp"
@@ -1469,7 +1472,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=3"
 
     # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "mtp"
@@ -1488,30 +1491,31 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=3"
 
   - isl: 8192
     osl: 1024
     search-space:
     # MTP configurations
-    # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
+    # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "PREFILL_NODES=2"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=1"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
     - spec-decoding: "mtp"
@@ -1530,7 +1534,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=3"
 
     # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "mtp"
@@ -1549,4 +1553,4 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=3"
diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh
index 54760882b..dc594a854 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh
@@ -30,7 +30,7 @@ export SGLANG_AITER_MLA_PERSIST=1
 MTP_ARGS=""
 CHAT_TEMPLATE_ARGS=""
 if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then
-    MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 1 --speculative-num-draft-tokens 2"
+    MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 3 --speculative-num-draft-tokens 4"
     CHAT_TEMPLATE_ARGS="--use-chat-template"
 fi
 

From 2afb24a7c008e9f0b92d7509da43c194262d9014 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 2 Apr 2026 19:17:47 +0000
Subject: [PATCH 13/41] Add single-node EP8/DP test configs for MI325X disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds P(tp4) → D(tp8/ep8/dp, 1 node) search-space entries for both
non-MTP and MTP disagg configs. This isolates whether EP/DP itself
is broken on MI325X or if only the multi-node distributed init
hangs with Broadcom Thor 2 NICs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 39 +++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 68d2aec6e..adacf0203 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1340,6 +1340,26 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
+    # Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X
+    # or if only the multi-node distributed init is broken with Broadcom Thor 2)
+    - spec-decoding: "none"
+      conc-list: [ 512, 256, 128, 64 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
   - isl: 8192
     osl: 1024
     search-space:
@@ -1493,6 +1513,25 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
 
+    # Single-node EP8/DP decode with MTP (test: isolates EP/DP vs multi-node init)
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 256, 128, 64 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
   - isl: 8192
     osl: 1024
     search-space:

From 36aebfd083760cb06db29b61a10560e5a9004f81 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 3 Apr 2026 20:11:24 +0000
Subject: [PATCH 14/41] Move container image to semianalysiswork Docker Hub and
 fix launcher bugs

- Retag disagg image from ghcr.io/jordannanos/sgl-mi325x-mori to
  semianalysiswork/sgl-cdna3-mori (unified name for MI300X/MI325X,
  both gfx942 CDNA3)
- Add empty JOB_ID guard in multi-node launcher path
- Fix grep to use -qx for exact job ID matching (prevents substring
  collisions with sibling SLURM jobs)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 4 ++--
 perf-changelog.yaml             | 2 +-
 runners/launch_mi325x-amd.sh    | 9 +++++++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index adacf0203..5c9d6ff41 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1251,7 +1251,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
 
 dsr1-fp8-mi325x-sglang-disagg:
-  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi325x-disagg
@@ -1423,7 +1423,7 @@ dsr1-fp8-mi325x-sglang-disagg:
 
 
 dsr1-fp8-mi325x-sglang-disagg-mtp:
-  image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi325x-disagg
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d059c439b..e8f5c3bab 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -7,7 +7,7 @@
     - "Add MI325X DeepSeek-R1 FP8 single-node and disaggregated inference with Broadcom Thor 2 IBGDA"
     - "Single-node: SGLang with aiter backend, MLA persist kernel, TP8, FP8 KV cache"
     - "Disaggregated: Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches"
-    - "Image (disagg): ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good"
+    - "Image (disagg): semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt"
     - "Image (single-node): lmsysorg/sglang:v0.5.9-rocm700-mi30x"
     - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k"
     - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker"
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 6ac64f3d8..89aa99934 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -89,12 +89,17 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then
 
     JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}")
 
+    if [[ -z "$JOB_ID" ]]; then
+        echo "ERROR: benchmark script produced no job ID"
+        exit 1
+    fi
+
     LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out"
 
     sleep 10
 
     while ! ls "$LOG_FILE" &>/dev/null; do
-        if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then
+        if ! squeue -u "$USER" --noheader --format='%i' | grep -qx "$JOB_ID"; then
             echo "ERROR: Job $JOB_ID failed before creating log file"
             scontrol show job "$JOB_ID"
             exit 1
@@ -105,7 +110,7 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then
     set +x
 
     (
-        while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do
+        while squeue -u $USER --noheader --format='%i' | grep -qx "$JOB_ID"; do
             sleep 10
         done
     ) &

From b5a0bc2a653ac4a848cbd8a23c8c69b5417c05bd Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Sat, 4 Apr 2026 05:05:18 +0000
Subject: [PATCH 15/41] Test EP8/DP workaround: drop MoRI a2a backend on MI325X
 bnxt_re

Add DeepSeek-R1-0528-bnxt model config in models.yaml that removes
--moe-a2a-backend mori from dp_flags while keeping MoRI for KV cache
transfer. The EP8/DP test configs now use MODEL_YAML_KEY override to
select this config, testing whether EP/DP works with default a2a
kernels on Broadcom Thor 2 NICs.

See sgl-project/sglang#22072

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml             | 10 ++++--
 benchmarks/multi_node/amd_utils/models.yaml | 34 +++++++++++++++++++++
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5c9d6ff41..1d7e304a8 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1340,8 +1340,8 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-    # Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X
-    # or if only the multi-node distributed init is broken with Broadcom Thor 2)
+    # Single-node EP8/DP decode — workaround: use default a2a kernels instead of
+    # MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072
     - spec-decoding: "none"
       conc-list: [ 512, 256, 128, 64 ]
       prefill:
@@ -1351,6 +1351,7 @@ dsr1-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
       decode:
         num-worker: 1
         tp: 8
@@ -1359,6 +1360,7 @@ dsr1-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
 
   - isl: 8192
     osl: 1024
@@ -1513,7 +1515,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
 
-    # Single-node EP8/DP decode with MTP (test: isolates EP/DP vs multi-node init)
+    # Single-node EP8/DP decode with MTP — workaround: default a2a kernels
     - spec-decoding: "mtp"
       conc-list: [ 512, 256, 128, 64 ]
       prefill:
@@ -1523,6 +1525,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
       decode:
         num-worker: 1
         tp: 8
@@ -1531,6 +1534,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         additional-settings:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
 
   - isl: 8192
     osl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 2bbdd91d6..60817e12e 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -130,6 +130,40 @@ DeepSeek-R1:
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
 
+# Workaround for MI325X Broadcom Thor 2 (bnxt_re): drop --moe-a2a-backend mori
+# from dp_flags to test if EP/DP works with default a2a kernels while keeping
+# MoRI for KV cache transfer. See sgl-project/sglang#22072
+DeepSeek-R1-0528-bnxt:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
 DeepSeek-R1-0528:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"

From beb3808bef9f39010b499ec173ca1d0149198655 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Sat, 4 Apr 2026 06:12:35 +0000
Subject: [PATCH 16/41] Fix MODEL_NAME for EP8/DP test configs with
 MODEL_YAML_KEY override

MODEL_NAME falls back to MODEL_YAML_KEY when the HF cache lookup fails
(pre-existing tr bug with double-dash in org--repo). With the
MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt override, that fallback names a
directory that does not exist on disk. Explicitly set
MODEL_NAME=DeepSeek-R1-0528 in additional-settings so the model path
resolves correctly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 1d7e304a8..7b28b5447 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1352,6 +1352,7 @@ dsr1-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+        - "MODEL_NAME=DeepSeek-R1-0528"
       decode:
         num-worker: 1
         tp: 8
@@ -1361,6 +1362,7 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+        - "MODEL_NAME=DeepSeek-R1-0528"
 
   - isl: 8192
     osl: 1024
@@ -1526,6 +1528,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+        - "MODEL_NAME=DeepSeek-R1-0528"
       decode:
         num-worker: 1
         tp: 8
@@ -1535,6 +1538,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+        - "MODEL_NAME=DeepSeek-R1-0528"
 
   - isl: 8192
     osl: 1024

From 23c293154c56009756cde8eadc9a99e92f23a07b Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan@semianalysis.com>
Date: Sat, 4 Apr 2026 07:13:17 +0000
Subject: [PATCH 17/41] fix: resolve MODEL_NAME from flat repo dir when HF
 snapshot absent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When MODEL_YAML_KEY differs from the actual model directory name
(e.g. DeepSeek-R1-0528-bnxt key → DeepSeek-R1-0528 dir), the old
fallback to MODEL_YAML_KEY produced a non-existent path, causing the
EP8/DP workaround jobs to fail the model-availability check before
even starting.

The MI325X cluster stores models as flat directories named after the
HF repo (e.g. hf-hub-cache/DeepSeek-R1-0528), not in HF hub cache
snapshot layout. Add an intermediate check for that flat-dir format
before falling back to MODEL_YAML_KEY.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 runners/launch_mi325x-amd.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 89aa99934..41b354705 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -75,6 +75,11 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then
         _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)
         if [[ -n "${_SNAPSHOT}" ]]; then
             export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"
+        elif [[ -d "${MODEL_PATH}/${MODEL##*/}" ]]; then
+            # Cluster stores models as flat dirs named after the repo (e.g. DeepSeek-R1-0528),
+            # not in HF hub cache layout. Use repo name so MODEL_YAML_KEY can differ from
+            # the path (e.g. DeepSeek-R1-0528-bnxt yaml key → DeepSeek-R1-0528 dir).
+            export MODEL_NAME="${MODEL##*/}"
         else
             export MODEL_NAME="${MODEL_YAML_KEY}"
         fi

From e5b9d00b8d97670f3b40408aca0ac5f9e5484a1c Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Sat, 4 Apr 2026 15:36:28 +0000
Subject: [PATCH 18/41] Tune EP8/DP test: lower concurrency + QP params for SQ
 full fix

Previous EP8/DP test hit RDMA SQ overflow (max=4351) at high concurrency.
Reduce conc to 4-64, set MORI_IO_QP_MAX_SEND_WR=4096 and
MORI_IO_QP_MAX_CQE=8192 to stay within Broadcom bnxt_re hardware limits,
and lower MORI_MAX_DISPATCH_TOKENS_DECODE to 160 for the decode worker.
Disable MTP EP8/DP test (HSA GPU fault) to focus on non-MTP first.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 54 ++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 7b28b5447..8f33d1116 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1340,10 +1340,10 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-    # Single-node EP8/DP decode — workaround: use default a2a kernels instead of
-    # MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072
+    # Single-node EP8/DP decode — workaround: no MoRI a2a, reduced concurrency
+    # to avoid RDMA SQ overflow on Broadcom bnxt_re. See sgl-project/sglang#22072
     - spec-decoding: "none"
-      conc-list: [ 512, 256, 128, 64 ]
+      conc-list: [ 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -1363,6 +1363,9 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
         - "MODEL_NAME=DeepSeek-R1-0528"
+        - "MORI_MAX_DISPATCH_TOKENS_DECODE=160"
+        - "MORI_IO_QP_MAX_SEND_WR=4096"
+        - "MORI_IO_QP_MAX_CQE=8192"
 
   - isl: 8192
     osl: 1024
@@ -1517,28 +1520,29 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
 
-    # Single-node EP8/DP decode with MTP — workaround: default a2a kernels
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 256, 128, 64 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-        - "MODEL_NAME=DeepSeek-R1-0528"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=3"
-        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-        - "MODEL_NAME=DeepSeek-R1-0528"
+    # DISABLED: MTP + EP8/DP hits HSA hardware exception (GPU kernel fault) on CDNA3.
+    # Focus on getting non-MTP EP8/DP working first.
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 64, 32, 16, 8, 4 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 4
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #     - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+    #     - "MODEL_NAME=DeepSeek-R1-0528"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=3"
+    #     - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+    #     - "MODEL_NAME=DeepSeek-R1-0528"
 
   - isl: 8192
     osl: 1024

From 76d89d0f7b39d118d41be66d8b33fb10091d14e0 Mon Sep 17 00:00:00 2001
From: Jordan Nanos <jordan@semianalysis.com>
Date: Sun, 5 Apr 2026 02:12:34 +0000
Subject: [PATCH 19/41] fix: lower bnxt_re QP limits and concurrency for MI325X
 EP8/DP disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Broadcom Thor 2 (bnxt_re) NICs cap SQ depth at ~4351 entries. The upstream
MORI defaults (SEND_WR=16384, CQE=32768) cause SQ overflow under EP8 RDMA
traffic. Lower to SEND_WR=4096 / CQE=8192 per sgl-project/sglang#22072.

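Also keep EP8/DP workaround concurrency capped at 64 (down from the
original 512) — bnxt_re saturates the SQ at higher concurrency with
EP8's 8x RDMA traffic multiplier.

Because the QP limits now live in env.sh, the per-config MORI_*
overrides are removed from the non-MTP EP8/DP entry, and the MTP EP8/DP
test entry is re-enabled with the same concurrency cap.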

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml        | 54 +++++++++++++-------------
 benchmarks/multi_node/amd_utils/env.sh |  7 +++-
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8f33d1116..c6aa404a0 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1340,8 +1340,9 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-    # Single-node EP8/DP decode — workaround: no MoRI a2a, reduced concurrency
-    # to avoid RDMA SQ overflow on Broadcom bnxt_re. See sgl-project/sglang#22072
+    # Single-node EP8/DP decode — workaround: use default a2a kernels instead of
+    # MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072
+    # Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8.
     - spec-decoding: "none"
       conc-list: [ 64, 32, 16, 8, 4 ]
       prefill:
@@ -1363,9 +1364,6 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
         - "MODEL_NAME=DeepSeek-R1-0528"
-        - "MORI_MAX_DISPATCH_TOKENS_DECODE=160"
-        - "MORI_IO_QP_MAX_SEND_WR=4096"
-        - "MORI_IO_QP_MAX_CQE=8192"
 
   - isl: 8192
     osl: 1024
@@ -1520,29 +1518,29 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
 
-    # DISABLED: MTP + EP8/DP hits HSA hardware exception (GPU kernel fault) on CDNA3.
-    # Focus on getting non-MTP EP8/DP working first.
-    # - spec-decoding: "mtp"
-    #   conc-list: [ 64, 32, 16, 8, 4 ]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 4
-    #     ep: 1
-    #     dp-attn: false
-    #     additional-settings:
-    #     - "PREFILL_NODES=1"
-    #     - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-    #     - "MODEL_NAME=DeepSeek-R1-0528"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 8
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "DECODE_NODES=1"
-    #     - "DECODE_MTP_SIZE=3"
-    #     - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-    #     - "MODEL_NAME=DeepSeek-R1-0528"
+    # Single-node EP8/DP decode with MTP — workaround: default a2a kernels
+    # Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8.
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+        - "MODEL_NAME=DeepSeek-R1-0528"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
+        - "MODEL_NAME=DeepSeek-R1-0528"
 
   - isl: 8192
     osl: 1024
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 99f2d0238..9e2942ecf 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -67,8 +67,11 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 
 export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
-export MORI_IO_QP_MAX_SEND_WR=16384
-export MORI_IO_QP_MAX_CQE=32768
+# Broadcom bnxt_re NICs cap SQ depth at ~4351 entries. Lower from upstream
+# defaults (16384/32768) to avoid SQ overflow under EP8 RDMA traffic.
+# See sgl-project/sglang#22072
+export MORI_IO_QP_MAX_SEND_WR=4096
+export MORI_IO_QP_MAX_CQE=8192
 export MORI_IO_QP_MAX_SGE=4
 
 export MORI_APP_LOG_LEVEL=INFO

From 4d9ee30dbfb0e1bab10505cace365397f0305844 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 9 Apr 2026 17:31:57 +0000
Subject: [PATCH 20/41] Add GLM-5 FP8 single-node benchmark for MI325X

SGLang with NSA tilelang backends, TP8, conc 4-64.
Based on MI355X GLM-5 script with MI325X aiter/MLA persist tuning.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml           | 18 +++++
 benchmarks/single_node/glm5_fp8_mi325x.sh | 80 +++++++++++++++++++++++
 perf-changelog.yaml                       |  8 +++
 3 files changed, 106 insertions(+)
 create mode 100755 benchmarks/single_node/glm5_fp8_mi325x.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index c6aa404a0..2b5479564 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -112,6 +112,24 @@ dsr1-fp8-mi325x-sglang-mtp:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
+glm5-fp8-mi325x-sglang:
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+
 dsr1-fp8-mi355x-sglang:
   image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
   model: deepseek-ai/DeepSeek-R1-0528
diff --git a/benchmarks/single_node/glm5_fp8_mi325x.sh b/benchmarks/single_node/glm5_fp8_mi325x.sh
new file mode 100755
index 000000000..1a35d7fc8
--- /dev/null
+++ b/benchmarks/single_node/glm5_fp8_mi325x.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# GLM-5 requires transformers with glm_moe_dsa model type support.
+python3 -m pip install -U --no-cache-dir \
+  "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2"
+
+hf download "$MODEL"
+
+# ROCm / SGLang performance tuning for MI325X
+export SGLANG_USE_AITER=1
+export SGLANG_AITER_MLA_PERSIST=1
+export SGLANG_ROCM_FUSED_DECODE_MLA=0
+export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export SAFETENSORS_FAST_GPU=1
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+python3 -m sglang.launch_server \
+    --model-path $MODEL \
+    --host=0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --trust-remote-code \
+    --tool-call-parser glm47 \
+    --reasoning-parser glm45 \
+    --mem-fraction-static 0.85 \
+    --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
+    --nsa-prefill-backend tilelang \
+    --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e8f5c3bab..9eab938a9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -13,6 +13,14 @@
     - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985
 
+- config-keys:
+    - glm5-fp8-mi325x-sglang
+  description:
+    - "Add GLM-5 FP8 single-node MI325X SGLang benchmark (TP8)"
+    - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends"
+    - "Image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985
+
 - config-keys:
     - kimik2.5-int4-mi300x-vllm
   description:

From 13c11670d4d52a2b1edfd0bdca7e71536152372c Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 9 Apr 2026 20:18:30 +0000
Subject: [PATCH 21/41] Skip HF download validation when model is cached on
 MI325X

Avoids stale NFS file lock errors when multiple runners hit the
shared HF cache simultaneously. Sets HF_HUB_OFFLINE=1 if the
model directory already exists.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/glm5_fp8_mi325x.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/glm5_fp8_mi325x.sh b/benchmarks/single_node/glm5_fp8_mi325x.sh
index 1a35d7fc8..5ff4b9e5a 100755
--- a/benchmarks/single_node/glm5_fp8_mi325x.sh
+++ b/benchmarks/single_node/glm5_fp8_mi325x.sh
@@ -19,7 +19,13 @@ fi
 python3 -m pip install -U --no-cache-dir \
   "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2"
 
-hf download "$MODEL"
+# Skip the download when the model is already in the HF cache (models--<org>--<repo>); avoids stale NFS locks.
+if [[ -d "${HF_HUB_CACHE}/models--${MODEL//\//--}" ]]; then
+    echo "Model already cached, skipping download"
+    export HF_HUB_OFFLINE=1
+else
+    hf download "$MODEL"
+fi
 
 # ROCm / SGLang performance tuning for MI325X
 export SGLANG_USE_AITER=1

From d4d6e1986307b15a4b3c57f79562ab489da67b02 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 9 Apr 2026 22:50:35 +0000
Subject: [PATCH 22/41] Add Qwen3.5 and GLM-5 FP8 disaggregated inference for
 MI325X

Port MI355X disagg configs (from chun-chang/sglang-disagg-qwen3.5) to
MI325X with Broadcom Thor 2 adaptations:
- TP-only configs (1P2D TP8, 1P1D TP4/TP8) enabled
- EP8/DP multi-node configs commented out (MoRI a2a hangs on bnxt_re)
- Single-node EP8/DP workaround included (no MoRI a2a, low conc)
- Model entries in models.yaml + bnxt variants for EP/DP workaround
- GLM-5 transformers patch and env tuning in server.sh/env.sh

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 354 ++++++++++++++++++
 benchmarks/multi_node/amd_utils/env.sh        |   7 +
 benchmarks/multi_node/amd_utils/models.yaml   | 125 +++++++
 benchmarks/multi_node/amd_utils/server.sh     |   7 +
 .../glm5_fp8_mi325x_sglang-disagg.sh          |  82 ++++
 .../qwen3.5_fp8_mi325x_sglang-disagg.sh       |  82 ++++
 perf-changelog.yaml                           |   7 +-
 7 files changed, 662 insertions(+), 2 deletions(-)
 create mode 100755 benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh
 create mode 100755 benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 2b5479564..25bcf30ca 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1621,3 +1621,357 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         additional-settings:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
+
+qwen3.5-fp8-mi325x-sglang-disagg:
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi325x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # DISABLED: EP8/DP configs — MoRI a2a hangs on Broadcom bnxt_re (sgl-project/sglang#22072)
+    # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP16)
+    # - spec-decoding: "none"
+    #   conc-list: [ 1024, 2048 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 1536, 1024, 512 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # Single-node EP8/DP workaround: no MoRI a2a, low conc
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
+        - "MORI_MAX_DISPATCH_TOKENS_DECODE=160"
+        - "MORI_IO_QP_MAX_SEND_WR=4096"
+        - "MORI_IO_QP_MAX_CQE=8192"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # DISABLED: EP8/DP (sgl-project/sglang#22072)
+    # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 1024, 2048 ]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "PREFILL_NODES=2"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+glm5-fp8-mi325x-sglang-disagg:
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi325x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # DISABLED: EP8/DP configs — MoRI a2a hangs on Broadcom bnxt_re (sgl-project/sglang#22072)
+    # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP16)
+    # - spec-decoding: "none"
+    #   conc-list: [ 1024, 2048 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 1536, 1024, 512 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # Single-node EP8/DP workaround: no MoRI a2a, low conc
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
+        - "MODEL_NAME=GLM-5-FP8"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
+        - "MODEL_NAME=GLM-5-FP8"
+        - "MORI_MAX_DISPATCH_TOKENS_DECODE=160"
+        - "MORI_IO_QP_MAX_SEND_WR=4096"
+        - "MORI_IO_QP_MAX_CQE=8192"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # DISABLED: EP8/DP (sgl-project/sglang#22072)
+    # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 1024, 2048 ]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "PREFILL_NODES=2"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 9e2942ecf..1d0e376e5 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -45,6 +45,13 @@ export SGLANG_USE_AITER=1
 export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200
 
+# GLM-5: uses NSA (not MLA), needs fused-decode-MLA disabled + fast loading
+if [[ "$MODEL_NAME" == *GLM-5* ]]; then
+    export SGLANG_ROCM_FUSED_DECODE_MLA=0
+    export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+    export SAFETENSORS_FAST_GPU=1
+fi
+
 # Disable allocating memory in one pass
 export MORI_SHMEM_MODE=ISOLATION
 export SGLANG_MORI_FP8_DISP=True
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 60817e12e..e2d5e78eb 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -256,3 +256,128 @@ DeepSeek-R1-0528-MXFP4:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+Qwen3.5-397B-A17B-FP8:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: ""
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+# MI325X bnxt workaround: no MoRI a2a for EP/DP (sgl-project/sglang#22072)
+Qwen3.5-397B-A17B-FP8-bnxt:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: ""
+  dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+GLM-5-FP8:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\"enable_multithread_load\": true, \"num_threads\": 8}'"
+  mtp_flags: ""
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+# MI325X bnxt workaround: no MoRI a2a for EP/DP (sgl-project/sglang#22072)
+GLM-5-FP8-bnxt:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\"enable_multithread_load\": true, \"num_threads\": 8}'"
+  mtp_flags: ""
+  dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 960cbb6e7..1384d6c9d 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -11,6 +11,13 @@ NODE_RANK="${NODE_RANK:-0}"
 MODEL_DIR="${MODEL_DIR:-}"
 MODEL_NAME="${MODEL_NAME:-}"
 
+# GLM-5 requires transformers with glm_moe_dsa model type support
+if [[ "$MODEL_NAME" == *GLM-5* ]]; then
+    echo "[setup] Installing transformers for GLM-5 model type support..."
+    pip install -U --no-cache-dir \
+      "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" 2>&1 | tail -3
+fi
+
 xP="${xP:-1}" #-> Number of Prefill Workers
 yD="${yD:-1}" #-> Number of Decode Workers
 
diff --git a/benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh
new file mode 100755
index 000000000..6a7314ab4
--- /dev/null
+++ b/benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh
new file mode 100755
index 000000000..6a7314ab4
--- /dev/null
+++ b/benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
+export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+export PREFILL_ENABLE_DP=true
+else
+export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
+export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+export DECODE_ENABLE_DP=true
+else
+export DECODE_ENABLE_DP=false
+fi
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO})
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9eab938a9..d63ffae43 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -15,10 +15,13 @@
 
 - config-keys:
     - glm5-fp8-mi325x-sglang
+    - glm5-fp8-mi325x-sglang-disagg
+    - qwen3.5-fp8-mi325x-sglang-disagg
   description:
-    - "Add GLM-5 FP8 single-node MI325X SGLang benchmark (TP8)"
-    - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends"
+    - "Add GLM-5 and Qwen3.5 FP8 MI325X benchmarks (single-node + disaggregated)"
+    - "Disagg: TP-only configs + single-node EP8/DP workaround (no MoRI a2a)"
     - "Image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt"
+    - "Ported from MI355X disagg configs (chun-chang/sglang-disagg-qwen3.5)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985
 
 - config-keys:

From 5228c620885cc1e7119d99140bdf42980395fd5f Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Thu, 9 Apr 2026 23:01:33 +0000
Subject: [PATCH 23/41] Fix HF cache path resolution: use sed instead of tr for
 org/repo separator

tr '/' '--' converts org/repo to org-repo (single dash), but HF hub
cache uses org--repo (double dash). Use sed 's|/|--|g' to produce the
correct models--org--repo directory name.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 runners/launch_mi325x-amd.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 41b354705..367ffad37 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -71,7 +71,7 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then
     # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container.
     # Auto-resolved from HF hub cache layout so no symlink is needed.
     if [[ -z "${MODEL_NAME:-}" ]]; then
-        _HF_DIR="models--$(echo "${MODEL}" | tr '/' '--')"
+        _HF_DIR="models--$(echo "${MODEL}" | sed 's|/|--|g')"
         _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)
         if [[ -n "${_SNAPSHOT}" ]]; then
             export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"

From b08abaf8d6a9d4b5320e9610c8fe0f28702f5087 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 05:40:48 +0000
Subject: [PATCH 24/41] Sanitize MODEL_NAME in Docker container name

HF cache paths (models--org--repo/snapshots/hash) contain slashes
which are invalid in Docker container names. Sanitize MODEL_NAME
the same way as USER_NAME using tr.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 523bfd7c5..9634c7f1f 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -289,7 +289,8 @@ export DRY_RUN="${DRY_RUN:-0}"
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
-export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
+SANITIZED_MODEL=$(echo "$MODEL_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
+export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${SANITIZED_MODEL}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}"
 
 

From 6dbaa19723c3120c5caf0f93ba45cbc7d85b69bb Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 05:49:11 +0000
Subject: [PATCH 25/41] Force-reinstall transformers for GLM-5 in disagg Docker
 containers

Add --force-reinstall and diagnostic output to verify glm_moe_dsa
model type is available after install. Also match on MODEL_YAML_KEY
in case MODEL_NAME doesn't contain GLM-5.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 1384d6c9d..91571529b 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -12,10 +12,11 @@ MODEL_DIR="${MODEL_DIR:-}"
 MODEL_NAME="${MODEL_NAME:-}"
 
 # GLM-5 requires transformers with glm_moe_dsa model type support
-if [[ "$MODEL_NAME" == *GLM-5* ]]; then
+if [[ "$MODEL_NAME" == *GLM-5* ]] || [[ "$MODEL_YAML_KEY" == *GLM-5* ]]; then
     echo "[setup] Installing transformers for GLM-5 model type support..."
-    pip install -U --no-cache-dir \
-      "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" 2>&1 | tail -3
+    pip install -U --no-cache-dir --force-reinstall \
+      "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" 2>&1 | tail -5
+    python3 -c "import transformers; print(f'[setup] transformers {transformers.__version__}, glm_moe_dsa supported: {hasattr(transformers, \"GlmMoeDsaConfig\")}')" 2>&1
 fi
 
 xP="${xP:-1}" #-> Number of Prefill Workers

From 2c24d0d172f468c309d4ec87f82c88fe39be8787 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 06:31:53 +0000
Subject: [PATCH 26/41] Switch GLM-5 MI325X configs to v0.5.10 image

The v0.5.9 image doesn't recognize glm_moe_dsa model type even with
runtime transformers pip install (SGLang caches the old config registry).
Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt which has newer
transformers built in.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 25bcf30ca..67e0a04ac 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi325x-sglang:
-  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x
@@ -1800,7 +1800,7 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
 glm5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x-disagg

From d3522ec7d657e963512eb1aa72674a26751b9348 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 07:12:02 +0000
Subject: [PATCH 27/41] Switch GLM-5 MI325X to MI355X GLM-5 image (rocm/sgl-dev
 mori-0402)

The semianalysiswork images don't have glm_moe_dsa in SGLang's config
mapping. Use the MI355X GLM-5 disagg image, which was specifically
built with GLM-5 support. Note that MI355X is gfx950 (CDNA4) while
MI325X is gfx942 (CDNA3), so this relies on the image also supporting
gfx942.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 67e0a04ac..56250f5e8 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi325x-sglang:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x
@@ -1800,7 +1800,7 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
 glm5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x-disagg

From 5dd235fd95fde84d9a508820e6e35df56a7961f7 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 15:40:12 +0000
Subject: [PATCH 28/41] Switch Qwen3.5/GLM-5 disagg to v0.5.10 image + no-MoRI
 transfer

Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt image and drop
--disaggregation-transfer-backend mori for Qwen3.5/GLM-5 disagg.
The v0.5.9 MoRI image's router never became ready for non-DeepSeek
models. Use the no-MoRI model variants (Qwen3.5-397B-A17B-FP8-no-mori,
GLM-5-FP8-no-mori) to bypass MoRI entirely on bnxt_re.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml             | 70 ++++++++++++++++++++-
 benchmarks/multi_node/amd_utils/models.yaml | 64 +++++++++++++++++++
 2 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 56250f5e8..acb2e7db3 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi325x-sglang:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x
@@ -1623,7 +1623,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=3"
 
 qwen3.5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi325x-disagg
@@ -1646,6 +1646,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1653,6 +1655,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
@@ -1665,6 +1669,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #   decode:
     #     num-worker: 2
     #     tp: 8
@@ -1672,6 +1678,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1684,6 +1692,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1691,6 +1701,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
@@ -1703,6 +1715,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1710,6 +1724,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
 
     # Single-node EP8/DP workaround: no MoRI a2a, low conc
@@ -1722,6 +1738,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
@@ -1731,6 +1749,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
@@ -1752,6 +1772,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "PREFILL_NODES=2"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1759,6 +1781,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1771,6 +1795,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1778,6 +1804,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
@@ -1790,6 +1818,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1797,10 +1827,12 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
 
 glm5-fp8-mi325x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x-disagg
@@ -1823,6 +1855,8 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1830,6 +1864,8 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
@@ -1842,6 +1878,8 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
     #   decode:
     #     num-worker: 2
     #     tp: 8
@@ -1849,6 +1887,8 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1861,6 +1901,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1868,6 +1910,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
@@ -1880,6 +1924,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1887,6 +1933,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
 
     # Single-node EP8/DP workaround: no MoRI a2a, low conc
@@ -1899,6 +1947,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
         - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
         - "MODEL_NAME=GLM-5-FP8"
       decode:
@@ -1908,6 +1958,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
         - "MODEL_NAME=GLM-5-FP8"
@@ -1929,6 +1981,8 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "PREFILL_NODES=2"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1936,6 +1990,8 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1948,6 +2004,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1955,6 +2013,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
@@ -1967,6 +2027,8 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1974,4 +2036,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index e2d5e78eb..8b6686338 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -381,3 +381,67 @@ GLM-5-FP8-bnxt:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+# MI325X no-MoRI variants: drop disaggregation-transfer-backend mori entirely
+# Use if router never becomes ready with MoRI KV transfer on bnxt_re
+Qwen3.5-397B-A17B-FP8-no-mori:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
+  mtp_flags: ""
+  dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: 262144
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+GLM-5-FP8-no-mori:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\"enable_multithread_load\": true, \"num_threads\": 8}'"
+  mtp_flags: ""
+  dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: 262144
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"

From d8abc66ee15f4bfeb96bc3a5aa6ae6cd119850b3 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 15:40:12 +0000
Subject: [PATCH 29/41] Switch Qwen3.5/GLM-5 disagg to v0.5.10 image + no-MoRI
 transfer

Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt image and drop
--disaggregation-transfer-backend mori for Qwen3.5/GLM-5 disagg.
The v0.5.9 MoRI image's router never became ready for non-DeepSeek
models. Use the no-MoRI model variants to bypass MoRI entirely.

This follow-up appends the MODEL_YAML_KEY/MODEL_NAME overrides after
DECODE_MTP_SIZE=0 in the active decode settings.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index acb2e7db3..7c13f91f2 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1704,6 +1704,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1727,6 +1729,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
     # Single-node EP8/DP workaround: no MoRI a2a, low conc
     - spec-decoding: "none"
@@ -1807,6 +1811,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1830,6 +1836,8 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
+        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
 glm5-fp8-mi325x-sglang-disagg:
   image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
@@ -1913,6 +1921,8 @@ glm5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1936,6 +1946,8 @@ glm5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
 
     # Single-node EP8/DP workaround: no MoRI a2a, low conc
     - spec-decoding: "none"
@@ -2016,6 +2028,8 @@ glm5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -2039,3 +2053,5 @@ glm5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
+        - "MODEL_NAME=GLM-5-FP8"

From 44780e06d2901cd9a7a43ee19d8735b4205994be Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 16:06:56 +0000
Subject: [PATCH 30/41] Fix YAML: switch Qwen3.5/GLM-5 disagg to v0.5.10 +
 no-MoRI transfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt and drop the MoRI
transfer backend for Qwen3.5/GLM-5 disagg on MI325X. The previous sed
edit broke the YAML by inserting stray override lines (including
inside commented-out blocks) — this commit removes them using
validated Python-based editing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 48 ---------------------------------
 1 file changed, 48 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 7c13f91f2..3dfa3ef5c 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1646,8 +1646,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1655,8 +1653,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
@@ -1669,8 +1665,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #   decode:
     #     num-worker: 2
     #     tp: 8
@@ -1678,8 +1672,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1701,8 +1693,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
@@ -1726,8 +1716,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
@@ -1742,8 +1730,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
@@ -1753,8 +1739,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
@@ -1776,8 +1760,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "PREFILL_NODES=2"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1785,8 +1767,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1808,8 +1788,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
@@ -1833,8 +1811,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
         - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
@@ -1863,8 +1839,6 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -1872,8 +1846,6 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
@@ -1886,8 +1858,6 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: false
     #     additional-settings:
     #     - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
     #   decode:
     #     num-worker: 2
     #     tp: 8
@@ -1895,8 +1865,6 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -1918,8 +1886,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
@@ -1943,8 +1909,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
@@ -1959,8 +1923,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
         - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
         - "MODEL_NAME=GLM-5-FP8"
       decode:
@@ -1970,8 +1932,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
         - "MODEL_NAME=GLM-5-FP8"
@@ -1993,8 +1953,6 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "PREFILL_NODES=2"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
     #   decode:
     #     num-worker: 1
     #     tp: 8
@@ -2002,8 +1960,6 @@ glm5-fp8-mi325x-sglang-disagg:
     #     dp-attn: true
     #     additional-settings:
     #     - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
     #     - "DECODE_MTP_SIZE=0"
 
     # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
@@ -2025,8 +1981,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"
@@ -2050,8 +2004,6 @@ glm5-fp8-mi325x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
         - "MODEL_NAME=GLM-5-FP8"

From 21ce11aaf7b454ad5a64f23c1f6f95a30ac2263f Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 16:11:25 +0000
Subject: [PATCH 31/41] =?UTF-8?q?Remove=20MODEL=5FNAME=20overrides=20?=
 =?UTF-8?q?=E2=80=94=20let=20launcher=20resolve=20HF=20cache=20path?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MODEL_NAME=GLM-5-FP8 doesn't exist on disk; the actual path is
models--zai-org--GLM-5-FP8/snapshots/<hash>. The launcher's sed
fix already resolves this correctly when MODEL_NAME is unset.
Only MODEL_YAML_KEY override is needed for the no-MoRI model config.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 3dfa3ef5c..8c00326e7 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1371,7 +1371,6 @@ dsr1-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-        - "MODEL_NAME=DeepSeek-R1-0528"
       decode:
         num-worker: 1
         tp: 8
@@ -1381,7 +1380,6 @@ dsr1-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-        - "MODEL_NAME=DeepSeek-R1-0528"
 
   - isl: 8192
     osl: 1024
@@ -1548,7 +1546,6 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-        - "MODEL_NAME=DeepSeek-R1-0528"
       decode:
         num-worker: 1
         tp: 8
@@ -1558,7 +1555,6 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
         - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
-        - "MODEL_NAME=DeepSeek-R1-0528"
 
   - isl: 8192
     osl: 1024
@@ -1685,7 +1681,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1695,7 +1690,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1708,7 +1702,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1718,7 +1711,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
     # Single-node EP8/DP workaround: no MoRI a2a, low conc
     - spec-decoding: "none"
@@ -1731,7 +1723,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1741,7 +1732,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
         - "MORI_MAX_DISPATCH_TOKENS_DECODE=160"
         - "MORI_IO_QP_MAX_SEND_WR=4096"
         - "MORI_IO_QP_MAX_CQE=8192"
@@ -1780,7 +1770,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1790,7 +1779,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1803,7 +1791,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1813,7 +1800,6 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
-        - "MODEL_NAME=Qwen3.5-397B-A17B-FP8"
 
 glm5-fp8-mi325x-sglang-disagg:
   image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
@@ -1878,7 +1864,6 @@ glm5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1888,7 +1873,6 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1901,7 +1885,6 @@ glm5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1911,7 +1894,6 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
 
     # Single-node EP8/DP workaround: no MoRI a2a, low conc
     - spec-decoding: "none"
@@ -1924,7 +1906,6 @@ glm5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
-        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -1934,7 +1915,6 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-bnxt"
-        - "MODEL_NAME=GLM-5-FP8"
         - "MORI_MAX_DISPATCH_TOKENS_DECODE=160"
         - "MORI_IO_QP_MAX_SEND_WR=4096"
         - "MORI_IO_QP_MAX_CQE=8192"
@@ -1973,7 +1953,6 @@ glm5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 2
         tp: 8
@@ -1983,7 +1962,6 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
 
     # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
     - spec-decoding: "none"
@@ -1996,7 +1974,6 @@ glm5-fp8-mi325x-sglang-disagg:
         additional-settings:
         - "PREFILL_NODES=1"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"
       decode:
         num-worker: 1
         tp: 8
@@ -2006,4 +1983,3 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
-        - "MODEL_NAME=GLM-5-FP8"

From fc2f0d9dabaea6e440c806967379b689ab3e7604 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 17:59:15 +0000
Subject: [PATCH 32/41] Fix TP mismatch for non-MLA models in Qwen3.5/GLM-5
 disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Non-MLA models (Qwen3.5, GLM-5) require matched TP sizes between
prefill and decode for PD disaggregation. Changed P(tp4)→D(tp8) to
P(tp8)→D(tp8) for "Small scale" configs. DeepSeek-R1 (MLA) can use
mismatched TP because of compressed latent representations.

See sgl-project/sglang#15674

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8c00326e7..52d7c533c 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1691,12 +1691,12 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
 
-    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    # "Small scale" (1P1D both TP8 — non-MLA models need matched TP)
     - spec-decoding: "none"
       conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
         dp-attn: false
         additional-settings:
@@ -1780,12 +1780,12 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
 
-    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    # "Small scale" (1P1D both TP8 — non-MLA models need matched TP)
     - spec-decoding: "none"
       conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
         dp-attn: false
         additional-settings:
@@ -1874,12 +1874,12 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
 
-    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    # "Small scale" (1P1D both TP8 — non-MLA models need matched TP)
     - spec-decoding: "none"
       conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
         dp-attn: false
         additional-settings:
@@ -1963,12 +1963,12 @@ glm5-fp8-mi325x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
         - "MODEL_YAML_KEY=GLM-5-FP8-no-mori"
 
-    # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8)
+    # "Small scale" (1P1D both TP8 — non-MLA models need matched TP)
     - spec-decoding: "none"
       conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
         dp-attn: false
         additional-settings:

From c956ce21d78bf843b5e678143d25c2a11bc5e5d2 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 18:26:03 +0000
Subject: [PATCH 33/41] Add MI325X container image build scripts and
 documentation

Build script for semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori:
- SGLang v0.5.10 (supports Qwen3.5, GLM-5, DeepSeek-R1)
- Broadcom Thor 2 bnxt_rocelib for IBGDA/RoCEv2
- MoRI disaggregated inference
- Based on JordanNanos/sglang fork's rocm.Dockerfile

Includes sbatch script for building on the MI325X cluster and
documentation of build args, prerequisites, and known issues.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker/README-mi325x.md                | 78 +++++++++++++++++++++++
 docker/build-sglang-bnxt-mi325x.sbatch | 12 ++++
 docker/build-sglang-bnxt-mi325x.sh     | 85 ++++++++++++++++++++++++++
 3 files changed, 175 insertions(+)
 create mode 100644 docker/README-mi325x.md
 create mode 100644 docker/build-sglang-bnxt-mi325x.sbatch
 create mode 100755 docker/build-sglang-bnxt-mi325x.sh

diff --git a/docker/README-mi325x.md b/docker/README-mi325x.md
new file mode 100644
index 000000000..109e512e3
--- /dev/null
+++ b/docker/README-mi325x.md
@@ -0,0 +1,78 @@
+# MI325X Container Image Build
+
+## Image: `semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori`
+
+SGLang v0.5.10 container for AMD Instinct MI325X/MI300X (gfx942 CDNA3) with:
+- Broadcom Thor 2 RDMA support (bnxt_rocelib for RoCEv2 IBGDA)
+- MoRI disaggregated inference (KV cache transfer)
+- Qwen3.5 MoE (`qwen3_5_moe`), GLM-5 (`glm_moe_dsa`), DeepSeek-R1 model support
+- AITER optimized kernels, TileLang NSA backends
+
+## Prerequisites
+
+1. **Broadcom BCM driver**: Download `bcm5760x_231.2.63.0a.zip` from the [Broadcom support portal](https://www.broadcom.com/support) and place it in the `docker/` directory.
+
+2. **Docker**: Must build on a node with Docker and GPU access. Use the sbatch script on the MI325X cluster.
+
+3. **Docker Hub access**: Push credentials for `semianalysiswork` org. PAT is in `/nfsdata/sa/.j9s/InferenceX/.env.local` as `DOCKER_HUB_PAT`, login user `clustermax`.
+
+## Build
+
+```bash
+# Option 1: Direct build (on a node with Docker)
+cd docker/
+bash build-sglang-bnxt-mi325x.sh
+
+# Option 2: Submit as Slurm job
+cd docker/
+sbatch build-sglang-bnxt-mi325x.sbatch
+```
+
+## Build process
+
+The script:
+1. Clones [JordanNanos/sglang](https://github.com/JordanNanos/sglang) which contains the ROCm Dockerfile with bnxt patches
+2. Copies the BCM driver into the build context
+3. Builds with `SGL_BRANCH=v0.5.10`, `GPU_ARCH=gfx942`, `ENABLE_MORI=1`, `NIC_BACKEND=ibgda`
+4. Pushes to `docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori`
+
+### What the Dockerfile builds
+
+- **Base**: `rocm/sgl-dev:rocm7-vllm-20250904` (ROCm 7.0 for gfx942)
+- **AITER**: v0.1.10.post3 (AMD optimized kernels)
+- **TileLang**: ML compiler for NSA backends (GLM-5)
+- **Mooncake**: KV cache transfer engine for prefill/decode disaggregation
+- **SGLang**: v0.5.10 (inference runtime)
+- **MoRI**: AMD MoRI networking with bnxt_rocelib for Broadcom Thor 2 IBGDA
+- **Broadcom bnxt_rocelib**: Compiled from BCM driver package
+
+### Build args reference
+
+| Arg | Default | Description |
+|-----|---------|-------------|
+| `SGL_BRANCH` | `v0.5.9` | SGLang git ref to build |
+| `GPU_ARCH` | `gfx950` | GPU arch: `gfx942` (MI300X/MI325X) or `gfx950` (MI355X) |
+| `ENABLE_MORI` | `0` | Set to `1` to build MoRI networking |
+| `NIC_BACKEND` | `none` | `ainic` (Pensando), `ibgda` (Broadcom), or `none` |
+| `BCM_DRIVER` | `bcm5760x_231.2.63.0a.zip` | BCM driver filename in build context |
+
+## Usage in InferenceX configs
+
+```yaml
+# .github/configs/amd-master.yaml
+dsr1-fp8-mi325x-sglang-disagg:
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori
+  ...
+```
+
+## Compatibility
+
+- **MI325X** (gfx942, CDNA3, Broadcom Thor 2 NICs) — primary target
+- **MI300X** (gfx942, CDNA3) — same architecture, works if NICs are compatible
+- **MI355X** (gfx950, CDNA4) — NOT compatible, use upstream `rocm/sgl-dev` images
+
+## Known issues
+
+- EP8/DP with `--moe-a2a-backend mori` hangs on bnxt_re — use default a2a kernels (see sgl-project/sglang#22072)
+- RDMA SQ overflow at high concurrency with EP8 — cap `MORI_IO_QP_MAX_SEND_WR=4096`
+- Non-MLA models (Qwen3.5, GLM-5) need matched TP sizes between prefill and decode (see sgl-project/sglang#15674)
diff --git a/docker/build-sglang-bnxt-mi325x.sbatch b/docker/build-sglang-bnxt-mi325x.sbatch
new file mode 100644
index 000000000..6e051c71f
--- /dev/null
+++ b/docker/build-sglang-bnxt-mi325x.sbatch
@@ -0,0 +1,12 @@
+#!/bin/bash
+#SBATCH --job-name=build-sgl-bnxt
+#SBATCH --partition=compute
+#SBATCH --nodes=1
+#SBATCH --gres=gpu:1
+#SBATCH --time=360
+#SBATCH --output=%u-build-mi325x-%j.log
+#SBATCH --error=%u-build-mi325x-%j.log
+#SBATCH --chdir=/tmp
+
+set -euo pipefail
+bash "${SLURM_SUBMIT_DIR}/build-sglang-bnxt-mi325x.sh"
diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh
new file mode 100755
index 000000000..be6efc71e
--- /dev/null
+++ b/docker/build-sglang-bnxt-mi325x.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Build SGLang container image for MI325X with Broadcom bnxt_re RDMA support.
+#
+# Prerequisites:
+#   - Docker installed and running on a compute node with GPU access
+#   - Broadcom BCM driver archive placed in docker/ directory (see BCM_DRIVER below)
+#   - Docker Hub credentials: login as 'clustermax' to semianalysiswork org
+#     (PAT in /nfsdata/sa/.j9s/InferenceX/.env.local as DOCKER_HUB_PAT)
+#
+# This image supports:
+#   - AMD Instinct MI325X (gfx942 CDNA3) — also works on MI300X (same arch)
+#   - SGLang v0.5.10 with Qwen3.5 MoE, GLM-5 MoE, DeepSeek-R1 model support
+#   - MoRI disaggregated inference with Broadcom Thor 2 IBGDA/RoCEv2
+#   - AITER optimized kernels, TileLang NSA backends
+#
+# Usage:
+#   cd /path/to/InferenceX/docker
+#   bash build-sglang-bnxt-mi325x.sh
+#
+# The image is pushed to:
+#   docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori
+#
+# Build reference: https://github.com/JordanNanos/sglang/tree/main/docker
+
+set -euo pipefail
+
+# ---------- Configuration ----------
+SGL_BRANCH="v0.5.10"
+GPU_ARCH="gfx942"
+IMAGE_TAG="semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori"
+DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git"
+DOCKERFILE_REF="main"
+
+# Broadcom BCM driver — must be placed in the build context directory.
+# Download from Broadcom support portal (requires account).
+BCM_DRIVER="bcm5760x_231.2.63.0a.zip"
+
+# ---------- Clone build repo ----------
+WORK_DIR=$(mktemp -d)
+echo "[build] Cloning ${DOCKERFILE_REPO} (ref: ${DOCKERFILE_REF}) into ${WORK_DIR}"
+git clone --depth 1 --branch "${DOCKERFILE_REF}" "${DOCKERFILE_REPO}" "${WORK_DIR}/sglang"
+
+# ---------- Copy BCM driver into build context ----------
+# Search the CWD, this script's directory (the sbatch wrapper runs with --chdir=/tmp), then /root/cache.
+BUILD_CONTEXT="${WORK_DIR}/sglang/docker"
+BCM_SRC=""
+for dir in . "$(dirname "$0")" /root/cache; do
+    if [[ -f "${dir}/${BCM_DRIVER}" ]]; then BCM_SRC="${dir}/${BCM_DRIVER}"; break; fi
+done
+if [[ -z "${BCM_SRC}" ]]; then
+    echo "ERROR: BCM driver ${BCM_DRIVER} not found in the current directory, next to this script, or in /root/cache/" >&2
+    exit 1
+fi
+cp "${BCM_SRC}" "${BUILD_CONTEXT}/"
+echo "[build] BCM driver copied: ${BCM_SRC}"
+
+# ---------- Docker login ----------
+if [[ -f /nfsdata/sa/.j9s/InferenceX/.env.local ]]; then
+    source /nfsdata/sa/.j9s/InferenceX/.env.local
+    echo "${DOCKER_HUB_PAT}" | docker login -u clustermax --password-stdin
+fi
+
+# ---------- Build ----------
+echo "[build] Building ${IMAGE_TAG}"
+echo "[build]   SGL_BRANCH=${SGL_BRANCH}"
+echo "[build]   GPU_ARCH=${GPU_ARCH}"
+echo "[build]   ENABLE_MORI=1, NIC_BACKEND=ibgda"
+
+docker build \
+    --build-arg SGL_BRANCH="${SGL_BRANCH}" \
+    --build-arg GPU_ARCH="${GPU_ARCH}" \
+    --build-arg ENABLE_MORI=1 \
+    --build-arg NIC_BACKEND=ibgda \
+    --build-arg BCM_DRIVER="${BCM_DRIVER}" \
+    -t "${IMAGE_TAG}" \
+    -f "${BUILD_CONTEXT}/rocm.Dockerfile" \
+    "${BUILD_CONTEXT}/"
+
+# ---------- Push ----------
+echo "[build] Pushing ${IMAGE_TAG}"
+docker push "${IMAGE_TAG}"
+
+# ---------- Cleanup ----------
+rm -rf "${WORK_DIR}"
+echo "[build] Done: ${IMAGE_TAG}"

From 18f1c5cecd9fc07e37341c01eec272d9a0481a99 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 18:28:07 +0000
Subject: [PATCH 34/41] Use latest SGLang main for MI325X image build

Default SGL_BRANCH=main to get full model type support (qwen3_5_moe,
glm_moe_dsa, etc). v0.5.10 may not have GLM-5 glm_moe_dsa support.
Both SGL_BRANCH and IMAGE_TAG are now overridable via env vars.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker/README-mi325x.md            | 10 ++++++----
 docker/build-sglang-bnxt-mi325x.sh | 11 +++++++----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/docker/README-mi325x.md b/docker/README-mi325x.md
index 109e512e3..cec903b9c 100644
--- a/docker/README-mi325x.md
+++ b/docker/README-mi325x.md
@@ -1,8 +1,8 @@
 # MI325X Container Image Build
 
-## Image: `semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori`
+## Image: `semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori`
 
-SGLang v0.5.10 container for AMD Instinct MI325X/MI300X (gfx942 CDNA3) with:
+SGLang (latest main) container for AMD Instinct MI325X/MI300X (gfx942 CDNA3) with:
 - Broadcom Thor 2 RDMA support (bnxt_rocelib for RoCEv2 IBGDA)
 - MoRI disaggregated inference (KV cache transfer)
 - Qwen3.5 MoE (`qwen3_5_moe`), GLM-5 (`glm_moe_dsa`), DeepSeek-R1 model support
@@ -33,8 +33,10 @@ sbatch build-sglang-bnxt-mi325x.sbatch
 The script:
 1. Clones [JordanNanos/sglang](https://github.com/JordanNanos/sglang) which contains the ROCm Dockerfile with bnxt patches
 2. Copies the BCM driver into the build context
-3. Builds with `SGL_BRANCH=v0.5.10`, `GPU_ARCH=gfx942`, `ENABLE_MORI=1`, `NIC_BACKEND=ibgda`
-4. Pushes to `docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori`
+3. Builds with `SGL_BRANCH=main` (latest, supports all model types), `GPU_ARCH=gfx942`, `ENABLE_MORI=1`, `NIC_BACKEND=ibgda`
+4. Pushes to `docker.io/semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori`
+
+Override defaults: `SGL_BRANCH=v0.5.10 IMAGE_TAG=semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori bash build-sglang-bnxt-mi325x.sh`
 
 ### What the Dockerfile builds
 
diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh
index be6efc71e..8dcaa274f 100755
--- a/docker/build-sglang-bnxt-mi325x.sh
+++ b/docker/build-sglang-bnxt-mi325x.sh
@@ -9,7 +9,7 @@
 #
 # This image supports:
 #   - AMD Instinct MI325X (gfx942 CDNA3) — also works on MI300X (same arch)
-#   - SGLang v0.5.10 with Qwen3.5 MoE, GLM-5 MoE, DeepSeek-R1 model support
+#   - SGLang (latest main) with Qwen3.5 MoE, GLM-5 MoE, DeepSeek-R1 model support
 #   - MoRI disaggregated inference with Broadcom Thor 2 IBGDA/RoCEv2
 #   - AITER optimized kernels, TileLang NSA backends
 #
@@ -18,16 +18,19 @@
 #   bash build-sglang-bnxt-mi325x.sh
 #
 # The image is pushed to:
-#   docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori
+#   docker.io/semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori
+#
+# Override defaults with env vars:
+#   SGL_BRANCH=v0.5.10 IMAGE_TAG=semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori bash build-sglang-bnxt-mi325x.sh
 #
 # Build reference: https://github.com/JordanNanos/sglang/tree/main/docker
 
 set -euo pipefail
 
 # ---------- Configuration ----------
-SGL_BRANCH="v0.5.10"
+SGL_BRANCH="${SGL_BRANCH:-main}"
 GPU_ARCH="gfx942"
-IMAGE_TAG="semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori"
+IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori}"
 DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git"
 DOCKERFILE_REF="main"
 

From 13be2f6324ad596f677224a51fa81536699883a0 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 18:31:35 +0000
Subject: [PATCH 35/41] Update build script default to SGL_BRANCH=v0.5.10
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rebuilds semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt with proper
model type registration for qwen3_5_moe and glm_moe_dsa.

BCM driver (bcm5760x_231.2.63.0a.zip) is at /nfsdata/sa/.j9s/ on
the cluster — too large for git (192MB).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker/build-sglang-bnxt-mi325x.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh
index 8dcaa274f..22e5f923c 100755
--- a/docker/build-sglang-bnxt-mi325x.sh
+++ b/docker/build-sglang-bnxt-mi325x.sh
@@ -28,9 +28,9 @@
 set -euo pipefail
 
 # ---------- Configuration ----------
-SGL_BRANCH="${SGL_BRANCH:-main}"
+SGL_BRANCH="${SGL_BRANCH:-v0.5.10}"
 GPU_ARCH="gfx942"
-IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori}"
+IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt}"
 DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git"
 DOCKERFILE_REF="main"
 

From 9ec6e9daa39587d33be5e4bb9497e816a9a3f0cf Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 19:32:34 +0000
Subject: [PATCH 36/41] Add transformers patch layer for GLM-5/Qwen3.5 model
 type support

SGLang v0.5.10 pins an older transformers that doesn't have
glm_moe_dsa or qwen3_5_moe in its AutoConfig registry. Add a
thin Dockerfile layer that upgrades transformers to the commit
with these model types and verifies registration.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker/build-sglang-bnxt-mi325x.sh   | 11 +++++++++++
 docker/patch-transformers.Dockerfile | 10 ++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 docker/patch-transformers.Dockerfile

diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh
index 22e5f923c..d23740c6e 100755
--- a/docker/build-sglang-bnxt-mi325x.sh
+++ b/docker/build-sglang-bnxt-mi325x.sh
@@ -79,6 +79,17 @@ docker build \
     -f "${BUILD_CONTEXT}/rocm.Dockerfile" \
     "${BUILD_CONTEXT}/"
 
+# ---------- Patch transformers for GLM-5/Qwen3.5 model type support ----------
+PATCH_DOCKERFILE="$(dirname "$0")/patch-transformers.Dockerfile"
+if [[ -f "${PATCH_DOCKERFILE}" ]]; then
+    echo "[build] Patching transformers for glm_moe_dsa/qwen3_5_moe support"
+    docker build \
+        --build-arg BASE_IMAGE="${IMAGE_TAG}" \
+        -t "${IMAGE_TAG}" \
+        -f "${PATCH_DOCKERFILE}" \
+        "$(dirname "$0")/"
+fi
+
 # ---------- Push ----------
 echo "[build] Pushing ${IMAGE_TAG}"
 docker push "${IMAGE_TAG}"
diff --git a/docker/patch-transformers.Dockerfile b/docker/patch-transformers.Dockerfile
new file mode 100644
index 000000000..f89c44c25
--- /dev/null
+++ b/docker/patch-transformers.Dockerfile
@@ -0,0 +1,10 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# Upgrade transformers to get glm_moe_dsa and qwen3_5_moe model type support.
+# The SGLang v0.5.10 base image pins an older transformers that doesn't have these.
+RUN pip install --no-cache-dir -U \
+    "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" \
+    && python3 -c "import transformers; print(f'transformers {transformers.__version__}')" \
+    && python3 -c "from transformers import AutoConfig; AutoConfig.for_model('glm_moe_dsa')" \
+    && echo "glm_moe_dsa model type verified"

From 02645c76c3c915b81e89366e20312b806a6081cc Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 22:02:49 +0000
Subject: [PATCH 37/41] Build from SGLang main for Qwen3.5/GLM-5 PD disagg
 fixes

v0.5.10 is missing merged PRs #19002, #19076, #19086 that fix
non-MLA PD disaggregation (GQA head count, KV cache slice transfer,
mamba cache size). Building from main includes all these fixes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker/build-sglang-bnxt-mi325x.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh
index d23740c6e..20de93517 100755
--- a/docker/build-sglang-bnxt-mi325x.sh
+++ b/docker/build-sglang-bnxt-mi325x.sh
@@ -28,9 +28,9 @@
 set -euo pipefail
 
 # ---------- Configuration ----------
-SGL_BRANCH="${SGL_BRANCH:-v0.5.10}"
+SGL_BRANCH="${SGL_BRANCH:-main}"
 GPU_ARCH="gfx942"
-IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt}"
+IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:main-bnxt}"
 DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git"
 DOCKERFILE_REF="main"
 

From 947e33995f9b00b4e3a46d37e3b093dbbbcc6c71 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Fri, 10 Apr 2026 22:53:52 +0000
Subject: [PATCH 38/41] Switch Qwen3.5/GLM-5 to main-bnxt image with PD disagg
 fixes

Image semianalysiswork/sgl-bnxt-cdna3:main-bnxt built from SGLang
main which includes PRs #19002, #19076, #19086 fixing non-MLA PD
disaggregation (GQA head count, KV cache transfer, mamba cache).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 52d7c533c..a146b0ea4 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi325x-sglang:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x
@@ -1619,7 +1619,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=3"
 
 qwen3.5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi325x-disagg
@@ -1802,7 +1802,7 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
 
 glm5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x-disagg

From d64877420c4da34ce8fda9b7b0b0de2a94fec9ef Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Sat, 11 Apr 2026 00:57:22 +0000
Subject: [PATCH 39/41] Switch to v0.5.10-bnxt-patched (PD fixes + transformers
 patch)

SGLang main broke the disagg router for all models. Use v0.5.10
(has PD fixes #19086 merged) + transformers patch for model type
recognition. v0.5.10 has working router + PD KV cache fixes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a146b0ea4..9083dddf3 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi325x-sglang:
-  image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x
@@ -1619,7 +1619,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=3"
 
 qwen3.5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi325x-disagg
@@ -1802,7 +1802,7 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
 
 glm5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt
+  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x-disagg

From d6053e1c9c5d454cd966ff505ca5e678c762e430 Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Sat, 11 Apr 2026 03:42:02 +0000
Subject: [PATCH 40/41] Add thin bnxt layer Dockerfile for existing SGLang
 images

add-bnxt.Dockerfile adds Broadcom bnxt_rocelib + transformers patch
on top of any SGLang ROCm base image. Faster than full rebuild when
the base image already has the right SGLang version.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker/add-bnxt.Dockerfile | 48 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 docker/add-bnxt.Dockerfile

diff --git a/docker/add-bnxt.Dockerfile b/docker/add-bnxt.Dockerfile
new file mode 100644
index 000000000..9bb82eb58
--- /dev/null
+++ b/docker/add-bnxt.Dockerfile
@@ -0,0 +1,48 @@
+# Thin layer that adds Broadcom bnxt_rocelib RDMA support to any SGLang ROCm image.
+# Usage (the build context must contain the BCM_DRIVER archive, which is COPY'd below):
+#   docker build --build-arg BASE_IMAGE=lmsysorg/sglang:v0.5.9-rocm700-mi30x \
+#     -t semianalysiswork/sgl-bnxt-cdna3:v0.5.9-bnxt-lite \
+#     -f add-bnxt.Dockerfile .
+
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# Install RDMA userspace packages plus the toolchain used to build bnxt_rocelib below
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libibumad-dev rdma-core ibverbs-utils infiniband-diags \
+    gcc make libtool autoconf librdmacm-dev rdmacm-utils \
+    perftest ethtool libibverbs-dev strace \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build and install Broadcom bnxt_rocelib from the vendor archive (.zip or .tar.gz)
+ARG BCM_DRIVER=bcm5760x_231.2.63.0a.zip
+COPY ${BCM_DRIVER} /tmp/${BCM_DRIVER}
+RUN cd /tmp && \
+    case "${BCM_DRIVER}" in \
+      *.zip) apt-get update && apt-get install -y --no-install-recommends unzip && unzip -o ./${BCM_DRIVER} ;; \
+      *.tar.gz) tar zxf ./${BCM_DRIVER} ;; \
+      *) echo "ERROR: unsupported archive: ${BCM_DRIVER}" && exit 1 ;; \
+    esac && \
+    DIR_NAME="${BCM_DRIVER%.*}" && \
+    # The strip above removes only the last extension, so redo it for a .tar.gz archive
+    case "${BCM_DRIVER}" in *.tar.gz) DIR_NAME="${BCM_DRIVER%.tar.gz}" ;; esac && \
+    cd /tmp/${DIR_NAME}/drivers_linux/bnxt_rocelib && \
+    BCM_LIB=$(ls -1 *.tar.gz) && \
+    tar zxf ${BCM_LIB} && \
+    cd "${BCM_LIB%.tar.gz}" && \
+    sh ./autogen.sh && \
+    sh ./configure && \
+    make -j8 && \
+    # Back up inbox drivers, best effort; the subshell stops the fallback hiding earlier failures
+    (find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; 2>/dev/null || true) && \
+    make install && \
+    echo /usr/local/lib >> /etc/ld.so.conf && \
+    ldconfig && \
+    # Cleanup: driver archive, extracted tree, and apt lists left by the unzip install
+    rm -rf /tmp/${BCM_DRIVER} /tmp/${DIR_NAME} /var/lib/apt/lists/* && \
+    echo "bnxt_rocelib installed successfully"
+
+# Upgrade transformers for glm_moe_dsa and qwen3_5_moe model type support
+RUN pip install --no-cache-dir -U \
+    "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" \
+    && python3 -c "import transformers; print(f'transformers {transformers.__version__}')"

From 757d01536e6a0e38f90b1fb336cf012ea4326cda Mon Sep 17 00:00:00 2001
From: JordanNanos <jordan@semianalysis.com>
Date: Sat, 11 Apr 2026 03:45:34 +0000
Subject: [PATCH 41/41] Switch Qwen3.5/GLM-5 to amd-disagg-bnxt-lite image

Built from rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
(AMD's PD disagg image) + bnxt_rocelib + transformers patch.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9083dddf3..c2b55425f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi325x-sglang:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched
+  image: semianalysiswork/sgl-bnxt-cdna3:amd-disagg-bnxt-lite
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x
@@ -1619,7 +1619,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=3"
 
 qwen3.5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched
+  image: semianalysiswork/sgl-bnxt-cdna3:amd-disagg-bnxt-lite
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi325x-disagg
@@ -1802,7 +1802,7 @@ qwen3.5-fp8-mi325x-sglang-disagg:
         - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori"
 
 glm5-fp8-mi325x-sglang-disagg:
-  image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched
+  image: semianalysiswork/sgl-bnxt-cdna3:amd-disagg-bnxt-lite
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi325x-disagg