SemiAnalysisAI · JordanNanos · Apr 2, 2026 · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026
@@ -76,6 +76,24 @@ dsr1-fp8-mi300x-sglang:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
+dsr1-fp8-mi300x-sglang-mtp:  # single-node MI300X sweep; every search-space entry sets spec-decoding: mtp
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi300x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+  - isl: 8192  # same tp and concurrency range as the 1024/1024 entry
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
 dsr1-fp8-mi325x-sglang:
   image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
   model: deepseek-ai/DeepSeek-R1-0528
@@ -1272,3 +1290,187 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+
+dsr1-fp8-mi300x-sglang-disagg:  # multinode prefill/decode-disaggregated sweep; all entries use spec-decoding "none"
+  image: ghcr.io/jordannanos/sgl-mi300x-mori:v0.5.9-bnxt
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi300x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"  # 0 in every entry of this config, alongside spec-decoding "none"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192  # same two topologies and concurrency lists as the 1024/1024 entry
+    osl: 1024
+    search-space:
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+dsr1-fp8-mi300x-sglang-disagg-mtp:  # MTP variant of the disagg sweep: spec-decoding "mtp", DECODE_MTP_SIZE=3 throughout
+  image: ghcr.io/jordannanos/sgl-mi300x-mori:v0.5.9-bnxt
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi300x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"  # presumably read by server.sh, which scales the decode dispatch-token limit by (DECODE_MTP_SIZE + 1) when > 0
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]  # NOTE(review): only this "Bottom of curve" list goes down to 2; the others stop at 4 - confirm intentional
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
@@ -70,6 +70,10 @@ mi300x:
 - 'mi300x-amds_1'
 - 'mi300x-amds_2'
 - 'mi300x-amds_3'
+mi300x-disagg:  # runner pool referenced by `runner: mi300x-disagg` configs
+- 'mi300x-amds_0'  # NOTE(review): mi300x-amds_1 is in the mi300x pool but not here - confirm intentional
+- 'mi300x-amds_2'
+- 'mi300x-amds_3'
 mi325x:
 - 'mi325x-amd_0'
 - 'mi325x-amd_1'

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
@@ -20,6 +20,12 @@ if [[ -z "$IBDEVICES" ]]; then
         export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
     elif [[ $NODENAME == mia1* ]]; then
         export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    elif [[ $NODENAME == chi-mi325x* ]]; then
+        # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
+        export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
+    elif [[ $NODENAME == chi-mi300x* ]]; then
+        # Vultr/CPE MI300X cluster: Broadcom RoCE (bnxt_re); all 8 devices present
+        export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
     else
         echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
         exit 1
@@ -101,6 +107,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
         elif [[ $NODENAME == mia1* ]]; then
             export MORI_RDMA_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+        elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
+            # Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
+            export MORI_RDMA_TC=104
+            export MORI_RDMA_SL=3
+            echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
         else
             echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
         fi
@@ -114,6 +125,11 @@ else
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+    elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
+        # Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
+        export MORI_RDMA_TC=104
+        export MORI_RDMA_SL=3
+        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
     else
         echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
         echo "       This is normal for clusters without QoS or outside Docker containers."

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
@@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then
     exit 1
 fi
 
-# Validate MODEL_NAME exists as a top-level key in models.yaml
-if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
-    echo "Error: Model '$MODEL_NAME' not found in models.yaml"
+# MODEL_YAML_KEY: top-level key in models.yaml for server config lookup.
+# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>).
+_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
+
+# Validate the yaml key exists as a top-level key in models.yaml
+if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then
+    echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml"
     echo "Available models:"
     grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/  - /'
     exit 1
 fi
-echo "Model found: $MODEL_NAME"
+echo "Model found: $_MODEL_YAML_KEY"
 
 # All models use server.sh as the entrypoint
 RUN_FILE="server.sh"
@@ -133,6 +137,20 @@ check_model_path() {
     fi
 }
 
+# If MODEL_NAME is a plain name (not already a HF cache path), try to resolve
+# the HF hub cache layout on this node: models--{org}--{repo}/snapshots/<hash>
+# This handles clusters where the cache is node-local and can't be resolved
+# from the job launcher (which may run on a different host).
+if [[ "$MODEL_NAME" != models--* ]] && [[ "$MODEL_NAME" != *snapshots* ]]; then
+    _HF_ORG_REPO="${MODEL_YAML_KEY:-$MODEL_NAME}"
+    _HF_DIR="models--${_HF_ORG_REPO//\//--}"  # '/' -> '--'; tr cannot do this (1:1 char map, and GNU tr consumes a bare '--')
+    _SNAPSHOT=$(ls "${MODEL_DIR}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)  # lexicographically last snapshot dir; empty if none
+    if [[ -n "${_SNAPSHOT}" ]]; then
+        MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"
+        echo "Resolved MODEL_NAME from local HF cache: ${MODEL_NAME}"
+    fi
+fi
+
 # Check model weights exist on "$MODEL_DIR/$MODEL_NAME"
 if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
     MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
@@ -249,10 +267,9 @@ echo "NNODES is ${NNODES}"
 echo "REPO Directory is ${DI_REPO_DIR}"
 echo "USER_NAME is ${USER_NAME}"
 
-# Get the RDMA priority and DSCP value from the NIC
+# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully)
 if ! command -v nicctl >/dev/null 2>&1; then
-    echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
-    exit 1
+    echo "[INFO] nicctl not found. env.sh will fall back to hostname-based RDMA QoS detection inside the container." >&2
 fi
 
 # Reduce log spam
@@ -296,8 +313,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)
 
 cleanup() {
   echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
-  # clean up the logs folder
-  sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
+  # NFS-safe cleanup: timeout bounds a hung rm; path is quoted and skipped when SLURM_SUBMIT_DIR is empty (would be /logs)
+  [[ -n "${SLURM_SUBMIT_DIR:-}" ]] && timeout --kill-after=5 30 sudo rm -rf -- "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true
 
   echo "[${SLURM_JOB_ID}] cleanup done."
 }
@@ -357,7 +374,7 @@ exec sudo docker run --rm \
     --privileged \
     -v ${MODEL_DIR}:/models \
     -v \$HOME/.ssh:/root/.ssh \
-    -v $(which nicctl):/usr/sbin/nicctl \
+    $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \
     --shm-size 128G \
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
@@ -373,6 +390,7 @@ exec sudo docker run --rm \
     -e xP=\$xP \
     -e yD=\$yD \
     -e MODEL_NAME=\$MODEL_NAME \
+    -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \
     -e IPADDRS=\$IPADDRS \
     -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
     -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
@@ -72,11 +72,12 @@ fi
 # Load model config via inline Python (PyYAML is available in SGLang containers)
 # Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
 # is done here in Python to avoid bash glob-expanding the * characters.
+_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
 eval "$(python3 -c "
 import yaml, sys, os
 
 config_path = '${MODELS_YAML}'
-model_name = '${MODEL_NAME}'
+model_name = '${_MODEL_YAML_KEY}'
 
 with open(config_path) as f:
     models = yaml.safe_load(f)
@@ -212,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
 fi
 
+# With DP attention enabled, SGLang pins chunked_prefill_size to 1024, and that
+# value must not exceed SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. Raise the
+# decode dispatch-token limit to at least 1024 so the assertion holds.
+if [[ "$DECODE_ENABLE_DP" == "true" && "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then
+    MORI_MAX_DISPATCH_TOKENS_DECODE=1024
+fi
+
 # =============================================================================
 # Cluster Topology Configuration
 # =============================================================================