Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
6e39343
feat: MI300X disaggregated inference with Broadcom IBGDA (issue #982)
JordanNanos Apr 2, 2026
4071ffe
fix: address PR review comments
JordanNanos Apr 2, 2026
fba085b
fix: resolve HF cache snapshot path on compute node in job.slurm
JordanNanos Apr 3, 2026
65e8f99
fix: use glob to resolve HF cache dir with unknown org prefix
JordanNanos Apr 3, 2026
d407492
Merge remote-tracking branch 'origin/main' into jordan/mi300x-disagg-…
JordanNanos Apr 3, 2026
8cacb31
fix: move MI300X/MI325X disagg image to semianalysiswork Docker Hub
JordanNanos Apr 3, 2026
27fab49
fix: replace srun-based IP/model checks with scontrol in job.slurm
JordanNanos Apr 4, 2026
a4e56c3
ci: add 2-node-only disagg config keys for MI300X
JordanNanos Apr 5, 2026
fa21716
fix: submit sbatch as root so srun steps work on worker nodes
Apr 5, 2026
3683c13
fix: use squeue -j JOB_ID instead of squeue -u USER for job polling
Apr 5, 2026
f9b16f0
fix: use sudo -E to preserve environment when submitting sbatch as root
Apr 5, 2026
c1ca311
fix: sanitize MODEL_NAME in Docker container name
Apr 5, 2026
27a0836
fix: add timeout to NFS refresh srun step to prevent infinite hang
Apr 5, 2026
5626990
fix: use timeout wrapper instead of invalid srun --timeout flag
Apr 5, 2026
46a147c
fix: increase prefill TP from 4 to 8 in 2-node configs for R1-0528
Apr 5, 2026
691151d
fix: fall back to /vfs/models_blog when HF cache blobs are missing
Apr 6, 2026
7fa054e
fix: auto-configure UFW to allow cross-node external IP traffic
Apr 6, 2026
300ca05
fix: bind container barrier server to 0.0.0.0 for cross-subnet access
Apr 6, 2026
5a8ffec
fix: handle zero TPOT in process_result.py to avoid ZeroDivisionError
Apr 6, 2026
1b936c5
fix: address review comments on process_result.py and job.slurm
Apr 8, 2026
0ab1fcd
Merge remote-tracking branch 'origin/main' into jordan/mi300x-disagg-…
Apr 8, 2026
21f5eef
ci: re-trigger sweep after UFW fix on node 049
Apr 8, 2026
1237dac
ci: re-run sweep for full pareto coverage
Apr 8, 2026
7c3275d
Merge remote-tracking branch 'origin/main' into jordan/mi300x-disagg-…
Apr 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@ dsr1-fp8-mi300x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-R1-0528 at fp8 precision on the mi300x runner, SGLang framework,
# single node (multinode: false). Both sequence-length configs (1024/1024 and
# 8192/1024) use tp: 8 with conc-start 4 / conc-end 64 and spec-decoding: mtp.
dsr1-fp8-mi300x-sglang-mtp:
image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi300x
precision: fp8
framework: sglang
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }

dsr1-fp8-mi325x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -1272,3 +1290,187 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsr1-fp8-mi300x-sglang-disagg:
image: ghcr.io/jordannanos/sgl-mi300x-mori:v0.5.9-bnxt
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JordanNanos check ur email for access to https://hub.docker.com/u/semianalysiswork

nit: can u move this to our official SemiAnalysisAI public docker hub

also nit: why is the mi325 disagg image different from the mi300 disagg image? mi325 & mi300 are both CDNA3 architecture, just as h100 & h200 share the same architecture/stack, so a single image should work for both mi300 & mi325.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah it's the same image, just changed the tag

model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi300x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

- isl: 8192
osl: 1024
search-space:
# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

# DeepSeek-R1-0528 at fp8 precision on the mi300x-disagg runner pool:
# multi-node, disaggregated (separate prefill / decode workers) sglang-disagg
# config. Every search-space entry sets spec-decoding: "mtp" and
# DECODE_MTP_SIZE=3.
dsr1-fp8-mi300x-sglang-disagg-mtp:
# NOTE(review): image is pulled from a personal ghcr.io namespace - confirm
# whether it should move to the project's own registry.
image: ghcr.io/jordannanos/sgl-mi300x-mori:v0.5.9-bnxt
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi300x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=3"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=3"

- isl: 8192
osl: 1024
search-space:
# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
# NOTE(review): this list ends at 2, whereas the other "Bottom of curve"
# conc-lists in the mi300x disagg configs end at 4 - confirm intentional.
conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=3"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=3"
4 changes: 4 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ mi300x:
- 'mi300x-amds_1'
- 'mi300x-amds_2'
- 'mi300x-amds_3'
# Runner pool selected by `runner: mi300x-disagg` in the disaggregated configs.
# NOTE(review): 'mi300x-amds_1' is listed in the mi300x pool but not here -
# confirm the omission is intentional.
mi300x-disagg:
- 'mi300x-amds_0'
- 'mi300x-amds_2'
- 'mi300x-amds_3'
mi325x:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
Expand Down
16 changes: 16 additions & 0 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ if [[ -z "$IBDEVICES" ]]; then
export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
elif [[ $NODENAME == mia1* ]]; then
export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
elif [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI300X cluster: Broadcom RoCE (bnxt_re); all 8 devices present
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
else
echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
exit 1
Expand Down Expand Up @@ -101,6 +107,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
fi
Expand All @@ -114,6 +125,11 @@ else
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
echo " This is normal for clusters without QoS or outside Docker containers."
Expand Down
38 changes: 28 additions & 10 deletions benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then
exit 1
fi

# MODEL_YAML_KEY: top-level key in models.yaml for server config lookup.
# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>),
# so it is only used as the lookup key when MODEL_YAML_KEY is unset or empty.
_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"

# Validate the yaml key exists as a top-level key in models.yaml.
# Only the resolved key is checked: validating MODEL_NAME itself would reject
# HF-cache-path style names before the key override is ever consulted.
if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then
    echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml"
    echo "Available models:"
    # List every line that starts with a letter, trimmed to the key name.
    grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /'
    exit 1
fi
echo "Model found: $_MODEL_YAML_KEY"

# All models use server.sh as the entrypoint
RUN_FILE="server.sh"
Expand Down Expand Up @@ -133,6 +137,20 @@ check_model_path() {
fi
}

# If MODEL_NAME is a plain name (not already a HF cache path), try to resolve
# the HF hub cache layout on this node: models--{org}--{repo}/snapshots/<hash>
# This handles clusters where the cache is node-local and can't be resolved
# from the job launcher (which may run on a different host).
# When no snapshot directory is found, MODEL_NAME is left untouched.
if [[ "$MODEL_NAME" != models--* ]] && [[ "$MODEL_NAME" != *snapshots* ]]; then
    _HF_ORG_REPO="${MODEL_YAML_KEY:-$MODEL_NAME}"
    # Replace every '/' with '--' via bash pattern substitution.
    # `tr '/' '--'` cannot do this: tr maps one character to one character,
    # so it can never expand a single '/' into two dashes.
    _HF_DIR="models--${_HF_ORG_REPO//\//--}"
    # Picks the lexicographically last entry under snapshots/; with several
    # snapshots present this is not necessarily the most recent one.
    _SNAPSHOT=$(ls "${MODEL_DIR}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1)
    if [[ -n "${_SNAPSHOT}" ]]; then
        MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}"
        echo "Resolved MODEL_NAME from local HF cache: ${MODEL_NAME}"
    fi
fi

# Check model weights exist on "$MODEL_DIR/$MODEL_NAME"
if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
Expand Down Expand Up @@ -249,10 +267,9 @@ echo "NNODES is ${NNODES}"
echo "REPO Directory is ${DI_REPO_DIR}"
echo "USER_NAME is ${USER_NAME}"

# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully)
# A missing nicctl is reported but is not fatal: the job continues and the
# message goes to stderr so it does not mix with regular job output.
if ! command -v nicctl >/dev/null 2>&1; then
    echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2
fi

# Reduce log spam
Expand Down Expand Up @@ -296,8 +313,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)

# Remove the job's logs folder under SLURM_SUBMIT_DIR and report progress.
# Per its log message this runs when the job receives a termination signal
# (presumably installed via trap elsewhere in the script - confirm).
# Best-effort: failures and timeouts are ignored so cleanup never fails the job.
cleanup() {
    echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
    # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks.
    # Only the time-bounded rm is run; an unbounded rm ahead of it would hang
    # first and make the timeout pointless.
    # The guard keeps an empty SLURM_SUBMIT_DIR from expanding to "/logs";
    # quoting and "--" protect against spaces or a leading dash in the path.
    if [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
        timeout --kill-after=5 30 sudo rm -rf -- "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true
    fi

    echo "[${SLURM_JOB_ID}] cleanup done."
}
Expand Down Expand Up @@ -357,7 +374,7 @@ exec sudo docker run --rm \
--privileged \
-v ${MODEL_DIR}:/models \
-v \$HOME/.ssh:/root/.ssh \
-v $(which nicctl):/usr/sbin/nicctl \
$(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \
--shm-size 128G \
-v /tmp:/run_logs \
-v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
Expand All @@ -373,6 +390,7 @@ exec sudo docker run --rm \
-e xP=\$xP \
-e yD=\$yD \
-e MODEL_NAME=\$MODEL_NAME \
-e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \
-e IPADDRS=\$IPADDRS \
-e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
-e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \
Expand Down
10 changes: 9 additions & 1 deletion benchmarks/multi_node/amd_utils/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,12 @@ fi
# Load model config via inline Python (PyYAML is available in SGLang containers)
# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP")
# is done here in Python to avoid bash glob-expanding the * characters.
_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
eval "$(python3 -c "
import yaml, sys, os

config_path = '${MODELS_YAML}'
model_name = '${MODEL_NAME}'
model_name = '${_MODEL_YAML_KEY}'

with open(config_path) as f:
models = yaml.safe_load(f)
Expand Down Expand Up @@ -212,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
fi

# With DP attention enabled, SGLang pins chunked_prefill_size to 1024 and
# asserts that it does not exceed SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK.
# Raise the decode dispatch token limit to that floor whenever DP is on and
# the computed value is lower.
if [[ "$DECODE_ENABLE_DP" == "true" && "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then
    MORI_MAX_DISPATCH_TOKENS_DECODE=1024
fi

# =============================================================================
# Cluster Topology Configuration
# =============================================================================
Expand Down
Loading
Loading