From 834fe82a4302e7ec27c9230bb4caf9a352db56d4 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 31 Mar 2026 17:05:53 +0000 Subject: [PATCH 01/41] Add MI325X DeepSeek-R1 FP8 disaggregated inference (1P1D, Broadcom Thor 2 IBGDA) Port the MI355X disagg recipe to MI325X (gfx942/CDNA3) on a Vultr Slurm cluster with Broadcom BCM5760X Thor 2 NICs using IBGDA for GPU-Direct RDMA via MoRI. Container image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt Built from akao-amd/sglang rocm.Dockerfile with: - GPU_ARCH=gfx942, ENABLE_MORI=1, NIC_BACKEND=ibgda - Broadcom bnxt_rocelib (bcm5760x_231.2.63.0a) for RDMA userspace - MoRI pinned to HEAD (c0eccaf2) for bundled bnxt headers + dlopen - smg-wasm pinned to =1.0.0 (v1.0.1 breaks sgl-model-gateway v0.5.9 API) Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 149 +++++++++++++ .github/configs/runners.yaml | 5 + benchmarks/multi_node/amd_utils/env.sh | 3 + benchmarks/multi_node/amd_utils/job.slurm | 20 +- benchmarks/multi_node/amd_utils/server.sh | 3 +- .../dsr1_fp8_mi325x_sglang-disagg.sh | 82 +++++++ runners/launch_mi325x-amd.sh | 206 +++++++++++++++--- scripts/manual-test-mi325x.sh | 37 ++++ 8 files changed, 467 insertions(+), 38 deletions(-) create mode 100755 benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh create mode 100755 scripts/manual-test-mi325x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e84fc0da5..6da2a4e22 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1231,3 +1231,152 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsr1-fp8-mi325x-sglang-disagg: + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # "Top of curve" (1 prefill worker at TP8, 1 
decode worker at DEP8) + - spec-decoding: "none" + conc-list: [ 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + - spec-decoding: "none" + conc-list: [ 768, 512, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + - spec-decoding: "none" + conc-list: [ 512, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 
+ ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 1251e459d..f61e81e36 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -75,6 +75,11 @@ mi325x: - 'mi325x-amd_1' - 'mi325x-amd_2' - 'mi325x-amd_3' +mi325x-disagg: +- 'mi325x-amd_0' +- 'mi325x-amd_1' +- 'mi325x-amd_2' +- 'mi325x-amd_3' mi355x: - 'mi355x-amds_0' - 'mi355x-amds_1' diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 5565c5b3b..56572dfcf 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 elif [[ $NODENAME == mia1* ]]; then export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it + export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8 else echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 exit 1 diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 6b0352f24..0e8f465f5 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -30,14 +30,18 @@ if [[ ! 
-f "$MODELS_YAML" ]]; then exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml -if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" +# MODEL_YAML_KEY is the models.yaml lookup key (bare model name, e.g. DeepSeek-R1-0528). +# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/). +_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" + +# Validate the yaml key exists as a top-level key in models.yaml +if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then + echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi -echo "Model found: $MODEL_NAME" +echo "Model found: $_MODEL_YAML_KEY" # All models use server.sh as the entrypoint RUN_FILE="server.sh" @@ -249,10 +253,9 @@ echo "NNODES is ${NNODES}" echo "REPO Directory is ${DI_REPO_DIR}" echo "USER_NAME is ${USER_NAME}" -# Get the RDMA priority and DSCP value from the NIC +# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully) if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2 - exit 1 + echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." 
>&2 fi # Reduce log spam @@ -357,7 +360,7 @@ exec sudo docker run --rm \ --privileged \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ + $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ @@ -373,6 +376,7 @@ exec sudo docker run --rm \ -e xP=\$xP \ -e yD=\$yD \ -e MODEL_NAME=\$MODEL_NAME \ + -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \ -e IPADDRS=\$IPADDRS \ -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 7f174b760..b477790b3 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -72,11 +72,12 @@ fi # Load model config via inline Python (PyYAML is available in SGLang containers) # Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") # is done here in Python to avoid bash glob-expanding the * characters. 
+_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" eval "$(python3 -c " import yaml, sys, os config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' +model_name = '${_MODEL_YAML_KEY}' with open(config_path) as f: models = yaml.safe_load(f) diff --git a/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh new file mode 100755 index 000000000..6a7314ab4 --- /dev/null +++ b/benchmarks/multi_node/dsr1_fp8_mi325x_sglang-disagg.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. 
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 67f93a309..4e76c205a 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -4,37 +4,185 @@ export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" export PORT=8888 PARTITION="compute" -SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" -set -x - -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') - -if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" +# Detect benchmark subdir from where the script lives +SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh" +if [[ -f "benchmarks/multi_node/${SCRIPT_NAME}" ]]; then + BENCHMARK_SUBDIR="multi_node" +elif [[ -f "benchmarks/single_node/${SCRIPT_NAME}" ]]; then + BENCHMARK_SUBDIR="single_node" +else + echo "ERROR: ${SCRIPT_NAME} not found in benchmarks/multi_node or benchmarks/single_node" exit 1 fi -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE +# 
============================================================================= +# Multi-node disaggregated path: sbatch + Docker via submit.sh +# ============================================================================= +if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then + + scancel_sync() { + local jobid=$1 + local timeout=${2:-600} + local interval=10 + local start + start=$(date +%s) + + echo "[scancel_sync] Requesting cancel of job $jobid" + scancel "$jobid" || true + + while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do + local now + now=$(date +%s) + if (( now - start >= timeout )); then + echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s" + return 1 + fi + echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..." + sleep "$interval" + done + echo "[scancel_sync] job $jobid exited" + return 0 + } + + set -x + + export SLURM_ACCOUNT="$USER" + export SLURM_PARTITION="$PARTITION" + export SLURM_JOB_NAME="benchmark-sglang-disagg.job" + + export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}" + + # MODEL_YAML_KEY: top-level key in models.yaml for server config lookup. + if [[ -z "${MODEL_YAML_KEY:-}" ]]; then + export MODEL_YAML_KEY="${MODEL##*/}" + fi + + # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container. + # Auto-resolved from HF hub cache layout so no symlink is needed. 
+ if [[ -z "${MODEL_NAME:-}" ]]; then + _HF_DIR="models--$(echo "${MODEL}" | tr '/' '--')" + _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1) + if [[ -n "${_SNAPSHOT}" ]]; then + export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}" + else + export MODEL_NAME="${MODEL_YAML_KEY}" + fi fi -" -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ ---container-writable \ ---container-remap-root \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh - -scancel $JOB_ID + + export GPUS_PER_NODE=8 + + export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + mkdir -p "$BENCHMARK_LOGS_DIR" + sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" + + sleep 10 + + while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + sleep 5 + done + + set +x + + ( + while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + sleep 10 + done + ) & + POLL_PID=$! 
+ + tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + + wait $POLL_PID + + set -x + + cat > collect_latest_results.py <<'PY' +import os, sys +sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) +PY + + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + for result_file in $(find $LOGS_DIR -type f); do + file_name=$(basename $result_file) + if [ -f $result_file ]; then + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + + echo "All result files processed" + set +x + scancel_sync $JOB_ID + set -x + echo "Canceled the slurm job $JOB_ID" + + sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$ARTIFACT_DIR" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true + echo "Logs copied to $ARTIFACT_DIR for artifact upload" + fi + +# ============================================================================= +# Single-node path: enroot via salloc + srun +# ============================================================================= +else + + SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + + JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + + if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 + fi + + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-remap-root \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash benchmarks/single_node/${SCRIPT_NAME} + + scancel $JOB_ID + +fi diff --git 
a/scripts/manual-test-mi325x.sh b/scripts/manual-test-mi325x.sh new file mode 100755 index 000000000..c232ded2a --- /dev/null +++ b/scripts/manual-test-mi325x.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "${BASH_SOURCE[0]}")/.." + +export GITHUB_WORKSPACE=$(pwd) +export RUNNER_NAME=mi325x-amd-manual + +export MODEL=deepseek-ai/DeepSeek-R1-0528 +export EXP_NAME=dsr1_1k1k +export PRECISION=fp8 +export FRAMEWORK=sglang-disagg + +export IMAGE=ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + +export ISL=1024 +export OSL=1024 +export CONC_LIST="4 2 1" +export SPEC_DECODING=none +export RANDOM_RANGE_RATIO=1 + +export PREFILL_NODES=1 +export PREFILL_NUM_WORKERS=1 +export PREFILL_TP=4 +export PREFILL_EP=1 +export PREFILL_DP_ATTN=false + +export DECODE_NODES=1 +export DECODE_NUM_WORKERS=1 +export DECODE_TP=8 +export DECODE_EP=1 +export DECODE_DP_ATTN=false + +bash runners/launch_mi325x-amd.sh + +#model files are here: +#/nfsdata/sa/gharunner/gharunners/hf-hub-cache/models--deepseek-ai--DeepSeek-R1-0528 \ No newline at end of file From 7b5047673f6e33a310754b85175f665fd9d5f08f Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Tue, 31 Mar 2026 10:36:30 -0700 Subject: [PATCH 02/41] Update amd-master.yaml --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 6da2a4e22..815023c55 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1233,7 +1233,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: dsr1-fp8-mi325x-sglang-disagg: - image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi325x-disagg From b40908ca814db79aea0248c641b0ee359e09c762 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 31 Mar 2026 17:50:20 +0000 Subject: [PATCH 03/41] Add MTP config, expand sweep to full pareto 
frontier, use -good image - Add dsr1-fp8-mi325x-sglang-disagg-mtp config with MTP=1/2 across all curve points (top/middle/bottom/low-conc) for both 1k/1k and 8k/1k - Expand concurrency lists to cover full pareto frontier including non-optimal points - Update image tag to v0.5.9-bnxt-good (the pushed image) Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 152 ++++++++++++++++++++++++++++++++ scripts/manual-test-mi325x.sh | 2 +- 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 815023c55..00b6a26de 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1380,3 +1380,155 @@ dsr1-fp8-mi325x-sglang-disagg: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + + +dsr1-fp8-mi325x-sglang-disagg-mtp: + image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + - spec-decoding: "mtp" + conc-list: [ 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + - spec-decoding: "mtp" + conc-list: [ 768, 512, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 
256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + - spec-decoding: "mtp" + conc-list: [ 512, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" diff --git a/scripts/manual-test-mi325x.sh b/scripts/manual-test-mi325x.sh index c232ded2a..30ec87d6a 100755 --- a/scripts/manual-test-mi325x.sh +++ b/scripts/manual-test-mi325x.sh @@ -15,7 +15,7 
@@ export IMAGE=ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good export ISL=1024 export OSL=1024 -export CONC_LIST="4 2 1" +export CONC_LIST="1024 512 256 128 64 32 16 8 4 2 1" export SPEC_DECODING=none export RANDOM_RANGE_RATIO=1 From 2421ca580cbb54491cd0bd12666ca1f660300908 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 31 Mar 2026 17:56:58 +0000 Subject: [PATCH 04/41] Add perf-changelog entry for MI325X disagg configs Co-Authored-By: Claude Opus 4.6 (1M context) --- perf-changelog.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 967edc19c..8e8ebc989 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,14 @@ +- config-keys: + - dsr1-fp8-mi325x-sglang-disagg + - dsr1-fp8-mi325x-sglang-disagg-mtp + description: + - "Add MI325X DeepSeek-R1 FP8 disaggregated inference with Broadcom Thor 2 IBGDA" + - "Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches" + - "Image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good" + - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k" + - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 + - config-keys: - kimik2.5-int4-mi300x-vllm description: From 6abdf85570d876220480a317ba9635451ccb055f Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Wed, 1 Apr 2026 00:19:24 +0000 Subject: [PATCH 05/41] Fix MI325X QoS detection and NFS-safe cleanup for disagg benchmarks - Add chi-mi325x* hostname detection in env.sh for RDMA QoS config (MORI_RDMA_TC=104, MORI_RDMA_SL=3, derived from DCB DSCP AF31->prio 3) since nicctl is not available on Vultr/CPE MI325X hosts - Wrap sudo rm -rf calls with timeout 30s in launch_mi325x-amd.sh and job.slurm to prevent indefinite hangs on stale NFS locks Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/multi_node/amd_utils/env.sh | 10 ++++++++++ 
benchmarks/multi_node/amd_utils/job.slurm | 4 ++-- runners/launch_mi325x-amd.sh | 6 ++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 56572dfcf..99f2d0238 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -104,6 +104,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p { elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104 + export MORI_RDMA_TC=104 + export MORI_RDMA_SL=3 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." fi @@ -117,6 +122,11 @@ else elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == chi-mi325x* ]]; then + # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104 + export MORI_RDMA_TC=104 + export MORI_RDMA_SL=3 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." echo " This is normal for clusters without QoS or outside Docker containers." diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 0e8f465f5..784161d06 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -299,8 +299,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) cleanup() { echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." 
- # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 4e76c205a..a21d2fd58 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -74,7 +74,8 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") @@ -137,7 +138,8 @@ PY set -x echo "Canceled the slurm job $JOB_ID" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + # NFS-safe cleanup: use timeout to avoid hanging on stale NFS locks + timeout --kill-after=5 30 sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true if [[ -n "${GITHUB_ACTIONS:-}" ]]; then ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" From 37162588f5776b1e4282577f801da8fd93d7448c Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Wed, 1 Apr 2026 06:44:31 +0000 Subject: [PATCH 06/41] Add local NVMe model caching for faster model loading Pre-stage model weights from NFS/shared storage to local NVMe before the inference server starts. Reduces model load time for large models (e.g., DeepSeek-R1 ~340GB FP8) from NFS read speeds to NVMe speeds. - utils/setup_local_nvme.sh: One-time NVMe setup script for compute nodes (format, mount, fstab entry). Supports single drive or RAID-0. - utils/cache_model_locally.sh: Standalone/sourceable model caching utility using rsync with parallel blob sync for HF hub cache layout. 
- job.slurm: When LOCAL_MODEL_CACHE_DIR is set, runs srun-based parallel rsync on all nodes before Docker starts. Idempotent (skips if cached). Falls back to shared storage if caching fails. - launch_mi325x-amd.sh: Enable local caching at /local-nvme/models for MI325X cluster (8x 3.5TB NVMe per node). Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/multi_node/amd_utils/job.slurm | 61 +++++++++++ runners/launch_mi325x-amd.sh | 5 + utils/cache_model_locally.sh | 109 ++++++++++++++++++++ utils/setup_local_nvme.sh | 118 ++++++++++++++++++++++ 4 files changed, 293 insertions(+) create mode 100755 utils/cache_model_locally.sh create mode 100755 utils/setup_local_nvme.sh diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 784161d06..7c746b41a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -321,6 +321,67 @@ srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Optional: Pre-stage model to local NVMe for faster loading +# ============================================================================= +# LOCAL_MODEL_CACHE_DIR: mount point for fast local storage (NVMe/SSD) on compute nodes. +# Set per-cluster via the runner/launch script. When set, model weights are rsync'd +# from shared storage to local NVMe before Docker starts. This is idempotent — +# subsequent runs skip files already cached locally. +# +# If unset or the local path doesn't exist, the model is served directly from +# shared storage (NFS/Lustre) as before. +if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then + LOCAL_MODEL_FULL="${LOCAL_MODEL_CACHE_DIR}/${MODEL_NAME}" + echo "[cache] Pre-staging model to local NVMe on all nodes..." 
+ echo "[cache] Source: $MODEL_PATH" + echo "[cache] Dest: $LOCAL_MODEL_FULL" + + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' + set -euo pipefail + SRC="'"$MODEL_PATH"'" + DST="'"$LOCAL_MODEL_FULL"'" + CACHE_DIR="'"${LOCAL_MODEL_CACHE_DIR}"'" + + # Create destination directory + sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR" + sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true + + SRC_COUNT=$(find "$SRC" -type f 2>/dev/null | wc -l) + DST_COUNT=$(find "$DST" -type f 2>/dev/null | wc -l) + + if [[ "$SRC_COUNT" -eq "$DST_COUNT" ]] && [[ "$DST_COUNT" -gt 0 ]]; then + echo "[cache] $(hostname): Already cached ($DST_COUNT files)" + else + echo "[cache] $(hostname): Syncing $SRC_COUNT files..." + START=$(date +%s) + + if [[ -d "$SRC/blobs" ]]; then + # HuggingFace hub cache layout: parallel-sync large blobs + mkdir -p "$DST/blobs" + find "$SRC/blobs" -type f -printf "%f\n" | \ + xargs -P '"${CACHE_PARALLEL_JOBS:-4}"' -I{} \ + rsync -a --whole-file --ignore-existing "$SRC/blobs/{}" "$DST/blobs/{}" + rsync -a --whole-file --ignore-existing --exclude="blobs/" "$SRC/" "$DST/" + else + # Flat model directory + rsync -a --whole-file --ignore-existing "$SRC/" "$DST/" + fi + + ELAPSED=$(( $(date +%s) - START )) + SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1) + echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)" + fi + ' 2>&1 + + if [[ $? -eq 0 ]]; then + echo "[cache] Model pre-staged successfully. Updating MODEL_DIR." + MODEL_DIR="${LOCAL_MODEL_CACHE_DIR}" + else + echo "[cache] WARNING: Local caching failed on some nodes. Falling back to shared storage." 
+ fi +fi + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index a21d2fd58..107c68d7d 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -3,6 +3,11 @@ export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" export PORT=8888 +# Local NVMe cache for model weights (set to empty to disable) +# MI325X nodes have 8x 3.5TB NVMe drives; /local-nvme must be set up +# via: sudo bash utils/setup_local_nvme.sh /local-nvme +export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR:-/local-nvme/models}" + PARTITION="compute" # Detect benchmark subdir from where the script lives diff --git a/utils/cache_model_locally.sh b/utils/cache_model_locally.sh new file mode 100755 index 000000000..37369d29e --- /dev/null +++ b/utils/cache_model_locally.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +# cache_model_locally.sh — Pre-stage model weights from shared storage to local NVMe. +# +# Syncs a model directory from NFS/shared storage to fast local NVMe before +# the inference server starts, dramatically reducing model load time. 
+# +# Usage: +# source utils/cache_model_locally.sh +# cache_model_locally "/nfs/models/deepseek-r1" "/local-nvme/models/deepseek-r1" +# +# Or as a standalone script: +# bash utils/cache_model_locally.sh /nfs/models/deepseek-r1 /local-nvme/models/deepseek-r1 +# +# Features: +# - Idempotent: skips files already present on the target +# - Preserves HuggingFace cache symlink structure +# - Concurrent execution safe (multiple nodes can cache simultaneously) +# - Configurable timeout to prevent NFS hangs +# - Works with both HF hub cache layout and flat model directories +# +# Environment variables: +# CACHE_PARALLEL_JOBS — number of parallel rsync jobs for large blobs (default: 4) +# CACHE_TIMEOUT — per-file timeout in seconds (default: 600) +# CACHE_DRY_RUN — set to 1 to print what would be synced without copying + +set -euo pipefail + +CACHE_PARALLEL_JOBS="${CACHE_PARALLEL_JOBS:-4}" +CACHE_TIMEOUT="${CACHE_TIMEOUT:-600}" +CACHE_DRY_RUN="${CACHE_DRY_RUN:-0}" + +cache_model_locally() { + local src="${1:?Usage: cache_model_locally }" + local dst="${2:?Usage: cache_model_locally }" + + if [[ ! -d "$src" ]]; then + echo "[cache] ERROR: Source path does not exist: $src" >&2 + return 1 + fi + + # Quick check: if dest has the same number of regular files, skip entirely + local src_count dst_count + src_count=$(find "$src" -type f 2>/dev/null | wc -l) + dst_count=$(find "$dst" -type f 2>/dev/null | wc -l) + + if [[ "$src_count" -eq "$dst_count" ]] && [[ "$dst_count" -gt 0 ]]; then + echo "[cache] Already cached: $dst ($dst_count files)" + echo "$dst" + return 0 + fi + + echo "[cache] Syncing model to local storage..." 
+ echo "[cache] Source: $src" + echo "[cache] Dest: $dst" + echo "[cache] Parallel jobs: $CACHE_PARALLEL_JOBS" + + mkdir -p "$dst" + + local rsync_opts=(-a --whole-file --ignore-existing --info=name) + if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then + rsync_opts+=(--dry-run) + fi + + local start_time + start_time=$(date +%s) + + # Check if this is a HuggingFace hub cache directory (has blobs/ subdir) + if [[ -d "$src/blobs" ]]; then + echo "[cache] Detected HuggingFace hub cache layout" + + # Step 1: Parallel-sync the large blob files (the actual model weights) + mkdir -p "$dst/blobs" + find "$src/blobs" -type f -printf '%f\n' | \ + xargs -P "$CACHE_PARALLEL_JOBS" -I{} \ + timeout "$CACHE_TIMEOUT" rsync "${rsync_opts[@]}" "$src/blobs/{}" "$dst/blobs/{}" + + # Step 2: Sync everything else (symlinks in snapshots/, refs/, etc.) — fast + rsync "${rsync_opts[@]}" --exclude='blobs/' "$src/" "$dst/" + else + # Flat model directory: parallel-sync large files, then the rest + echo "[cache] Detected flat model directory" + + # Sync large files (>100MB) in parallel + find "$src" -type f -size +100M -printf '%P\n' | \ + xargs -P "$CACHE_PARALLEL_JOBS" -I{} bash -c \ + 'mkdir -p "$(dirname "'"$dst"'/{}")"; timeout '"$CACHE_TIMEOUT"' rsync '"$(printf '%q ' "${rsync_opts[@]}")"' "'"$src"'/{}" "'"$dst"'/{}"' + + # Sync remaining small files and symlinks + rsync "${rsync_opts[@]}" "$src/" "$dst/" + fi + + local elapsed=$(( $(date +%s) - start_time )) + local size + size=$(du -sh "$dst" 2>/dev/null | cut -f1) + + echo "[cache] Done in ${elapsed}s — $size cached at $dst" + echo "$dst" + return 0 +} + +# If run as a standalone script (not sourced), execute with args +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + if [[ $# -lt 2 ]]; then + echo "Usage: $0 " >&2 + echo " Env: CACHE_PARALLEL_JOBS=$CACHE_PARALLEL_JOBS CACHE_TIMEOUT=$CACHE_TIMEOUT" >&2 + exit 1 + fi + cache_model_locally "$1" "$2" +fi diff --git a/utils/setup_local_nvme.sh b/utils/setup_local_nvme.sh new file mode 100755 index 
000000000..03b81e8a4 --- /dev/null +++ b/utils/setup_local_nvme.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# setup_local_nvme.sh — Format and mount local NVMe drives for model caching. +# +# Detects unformatted/unmounted NVMe drives and sets up a mount point for +# caching model weights locally. Designed to be run once per node (idempotent). +# +# Usage (run on each compute node, requires root): +# sudo bash utils/setup_local_nvme.sh [mount_point] +# +# Default mount point: /local-nvme +# +# This script: +# 1. Finds the first available NVMe drive that is not the boot device +# 2. Formats it with ext4 if not already formatted +# 3. Mounts it at the specified mount point +# 4. Adds an fstab entry for persistence across reboots +# +# For RAID-0 across multiple NVMe drives (maximum throughput), use: +# sudo bash utils/setup_local_nvme.sh --raid [mount_point] + +set -euo pipefail + +USE_RAID=false +MOUNT_POINT="/local-nvme" + +while [[ $# -gt 0 ]]; do + case "$1" in + --raid) USE_RAID=true; shift ;; + *) MOUNT_POINT="$1"; shift ;; + esac +done + +if [[ $EUID -ne 0 ]]; then + echo "ERROR: This script must be run as root (sudo)" >&2 + exit 1 +fi + +echo "[nvme-setup] Mount point: $MOUNT_POINT" + +# Already mounted? 
+if mountpoint -q "$MOUNT_POINT" 2>/dev/null; then + echo "[nvme-setup] $MOUNT_POINT is already mounted:" + df -h "$MOUNT_POINT" + exit 0 +fi + +# Find NVMe drives that are not part of the root filesystem +ROOT_DEV=$(findmnt -n -o SOURCE / | sed 's/[0-9]*$//' | sed 's/p$//') +NVME_DRIVES=() +for dev in /dev/nvme*n1; do + [[ -b "$dev" ]] || continue + # Skip if this drive is part of root + if [[ "$dev" == "$ROOT_DEV"* ]]; then + echo "[nvme-setup] Skipping $dev (root device)" + continue + fi + # Skip if already mounted + if mount | grep -q "^$dev "; then + echo "[nvme-setup] Skipping $dev (already mounted)" + continue + fi + # Skip if part of an md array + if grep -q "$(basename "$dev")" /proc/mdstat 2>/dev/null; then + echo "[nvme-setup] Skipping $dev (part of md array)" + continue + fi + NVME_DRIVES+=("$dev") +done + +if [[ ${#NVME_DRIVES[@]} -eq 0 ]]; then + echo "[nvme-setup] No available NVMe drives found." + exit 1 +fi + +echo "[nvme-setup] Found ${#NVME_DRIVES[@]} available NVMe drives: ${NVME_DRIVES[*]}" + +if [[ "$USE_RAID" == true ]] && [[ ${#NVME_DRIVES[@]} -gt 1 ]]; then + # RAID-0 for maximum throughput + MD_DEV="/dev/md10" + echo "[nvme-setup] Creating RAID-0 array across ${#NVME_DRIVES[@]} drives..." + + if [[ -b "$MD_DEV" ]]; then + echo "[nvme-setup] $MD_DEV already exists, using it" + else + mdadm --create "$MD_DEV" --level=0 --raid-devices=${#NVME_DRIVES[@]} "${NVME_DRIVES[@]}" --run + fi + + TARGET_DEV="$MD_DEV" +else + # Single drive (use the first available) + TARGET_DEV="${NVME_DRIVES[0]}" + echo "[nvme-setup] Using single drive: $TARGET_DEV" +fi + +# Format if needed +if ! blkid "$TARGET_DEV" | grep -q 'TYPE="ext4"'; then + echo "[nvme-setup] Formatting $TARGET_DEV with ext4..." 
+ mkfs.ext4 -F -L local-nvme "$TARGET_DEV" +else + echo "[nvme-setup] $TARGET_DEV already has ext4 filesystem" +fi + +# Mount +mkdir -p "$MOUNT_POINT" +mount -o noatime,discard "$TARGET_DEV" "$MOUNT_POINT" + +# Set permissions so non-root users can write +chmod 1777 "$MOUNT_POINT" + +# Add fstab entry if not present +if ! grep -q "$MOUNT_POINT" /etc/fstab; then + UUID=$(blkid -s UUID -o value "$TARGET_DEV") + echo "UUID=$UUID $MOUNT_POINT ext4 noatime,discard,nofail 0 2" >> /etc/fstab + echo "[nvme-setup] Added fstab entry" +fi + +echo "[nvme-setup] Done:" +df -h "$MOUNT_POINT" From db677bd8fc2fc0a71ac57d67c35b88f6ccc06910 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Wed, 1 Apr 2026 06:54:51 +0000 Subject: [PATCH 07/41] Switch model caching from rsync to rclone sync Use rclone sync with --transfers 32 --checkers 32 --links for high-parallelism model pre-staging from NFS to local NVMe. rclone is now installed on all MI325X compute nodes (v1.73.3). Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/multi_node/amd_utils/job.slurm | 37 ++++-------- utils/cache_model_locally.sh | 71 ++++++----------------- 2 files changed, 30 insertions(+), 78 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 7c746b41a..523bfd7c5 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -347,31 +347,18 @@ if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR" sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true - SRC_COUNT=$(find "$SRC" -type f 2>/dev/null | wc -l) - DST_COUNT=$(find "$DST" -type f 2>/dev/null | wc -l) - - if [[ "$SRC_COUNT" -eq "$DST_COUNT" ]] && [[ "$DST_COUNT" -gt 0 ]]; then - echo "[cache] $(hostname): Already cached ($DST_COUNT files)" - else - echo "[cache] $(hostname): Syncing $SRC_COUNT files..." 
- START=$(date +%s) - - if [[ -d "$SRC/blobs" ]]; then - # HuggingFace hub cache layout: parallel-sync large blobs - mkdir -p "$DST/blobs" - find "$SRC/blobs" -type f -printf "%f\n" | \ - xargs -P '"${CACHE_PARALLEL_JOBS:-4}"' -I{} \ - rsync -a --whole-file --ignore-existing "$SRC/blobs/{}" "$DST/blobs/{}" - rsync -a --whole-file --ignore-existing --exclude="blobs/" "$SRC/" "$DST/" - else - # Flat model directory - rsync -a --whole-file --ignore-existing "$SRC/" "$DST/" - fi - - ELAPSED=$(( $(date +%s) - START )) - SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1) - echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)" - fi + echo "[cache] $(hostname): Syncing model to local NVMe..." + START=$(date +%s) + + rclone sync "$SRC/" "$DST/" \ + --transfers 32 \ + --checkers 32 \ + --links \ + --progress + + ELAPSED=$(( $(date +%s) - START )) + SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1) + echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)" ' 2>&1 if [[ $? -eq 0 ]]; then diff --git a/utils/cache_model_locally.sh b/utils/cache_model_locally.sh index 37369d29e..0b1480231 100755 --- a/utils/cache_model_locally.sh +++ b/utils/cache_model_locally.sh @@ -2,31 +2,30 @@ # cache_model_locally.sh — Pre-stage model weights from shared storage to local NVMe. # # Syncs a model directory from NFS/shared storage to fast local NVMe before -# the inference server starts, dramatically reducing model load time. +# the inference server starts, using rclone for high-parallelism transfers. 
# # Usage: # source utils/cache_model_locally.sh -# cache_model_locally "/nfs/models/deepseek-r1" "/local-nvme/models/deepseek-r1" +# cache_model_locally "/nfs/hub/models--org--repo" "/local-nvme/hub/models--org--repo" # # Or as a standalone script: -# bash utils/cache_model_locally.sh /nfs/models/deepseek-r1 /local-nvme/models/deepseek-r1 +# bash utils/cache_model_locally.sh /nfs/hub/models--org--repo /local-nvme/hub/models--org--repo # # Features: -# - Idempotent: skips files already present on the target -# - Preserves HuggingFace cache symlink structure -# - Concurrent execution safe (multiple nodes can cache simultaneously) -# - Configurable timeout to prevent NFS hangs +# - Uses rclone sync with 32 parallel transfers for maximum throughput +# - Preserves HuggingFace cache symlink structure (--links) +# - Idempotent: rclone skips files already present and identical # - Works with both HF hub cache layout and flat model directories # # Environment variables: -# CACHE_PARALLEL_JOBS — number of parallel rsync jobs for large blobs (default: 4) -# CACHE_TIMEOUT — per-file timeout in seconds (default: 600) -# CACHE_DRY_RUN — set to 1 to print what would be synced without copying +# CACHE_TRANSFERS — number of parallel rclone transfers (default: 32) +# CACHE_CHECKERS — number of parallel rclone checkers (default: 32) +# CACHE_DRY_RUN — set to 1 to print what would be synced without copying set -euo pipefail -CACHE_PARALLEL_JOBS="${CACHE_PARALLEL_JOBS:-4}" -CACHE_TIMEOUT="${CACHE_TIMEOUT:-600}" +CACHE_TRANSFERS="${CACHE_TRANSFERS:-32}" +CACHE_CHECKERS="${CACHE_CHECKERS:-32}" CACHE_DRY_RUN="${CACHE_DRY_RUN:-0}" cache_model_locally() { @@ -38,57 +37,23 @@ cache_model_locally() { return 1 fi - # Quick check: if dest has the same number of regular files, skip entirely - local src_count dst_count - src_count=$(find "$src" -type f 2>/dev/null | wc -l) - dst_count=$(find "$dst" -type f 2>/dev/null | wc -l) - - if [[ "$src_count" -eq "$dst_count" ]] && [[ "$dst_count" -gt 0 
]]; then - echo "[cache] Already cached: $dst ($dst_count files)" - echo "$dst" - return 0 - fi - echo "[cache] Syncing model to local storage..." echo "[cache] Source: $src" echo "[cache] Dest: $dst" - echo "[cache] Parallel jobs: $CACHE_PARALLEL_JOBS" + echo "[cache] Transfers: $CACHE_TRANSFERS, Checkers: $CACHE_CHECKERS" mkdir -p "$dst" - local rsync_opts=(-a --whole-file --ignore-existing --info=name) - if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then - rsync_opts+=(--dry-run) - fi - local start_time start_time=$(date +%s) - # Check if this is a HuggingFace hub cache directory (has blobs/ subdir) - if [[ -d "$src/blobs" ]]; then - echo "[cache] Detected HuggingFace hub cache layout" - - # Step 1: Parallel-sync the large blob files (the actual model weights) - mkdir -p "$dst/blobs" - find "$src/blobs" -type f -printf '%f\n' | \ - xargs -P "$CACHE_PARALLEL_JOBS" -I{} \ - timeout "$CACHE_TIMEOUT" rsync "${rsync_opts[@]}" "$src/blobs/{}" "$dst/blobs/{}" - - # Step 2: Sync everything else (symlinks in snapshots/, refs/, etc.) 
— fast - rsync "${rsync_opts[@]}" --exclude='blobs/' "$src/" "$dst/" - else - # Flat model directory: parallel-sync large files, then the rest - echo "[cache] Detected flat model directory" - - # Sync large files (>100MB) in parallel - find "$src" -type f -size +100M -printf '%P\n' | \ - xargs -P "$CACHE_PARALLEL_JOBS" -I{} bash -c \ - 'mkdir -p "$(dirname "'"$dst"'/{}")"; timeout '"$CACHE_TIMEOUT"' rsync '"$(printf '%q ' "${rsync_opts[@]}")"' "'"$src"'/{}" "'"$dst"'/{}"' - - # Sync remaining small files and symlinks - rsync "${rsync_opts[@]}" "$src/" "$dst/" + local rclone_opts=(--transfers "$CACHE_TRANSFERS" --checkers "$CACHE_CHECKERS" --links --progress) + if [[ "$CACHE_DRY_RUN" -eq 1 ]]; then + rclone_opts+=(--dry-run) fi + rclone sync "$src/" "$dst/" "${rclone_opts[@]}" + local elapsed=$(( $(date +%s) - start_time )) local size size=$(du -sh "$dst" 2>/dev/null | cut -f1) @@ -102,7 +67,7 @@ cache_model_locally() { if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then if [[ $# -lt 2 ]]; then echo "Usage: $0 " >&2 - echo " Env: CACHE_PARALLEL_JOBS=$CACHE_PARALLEL_JOBS CACHE_TIMEOUT=$CACHE_TIMEOUT" >&2 + echo " Env: CACHE_TRANSFERS=$CACHE_TRANSFERS CACHE_CHECKERS=$CACHE_CHECKERS" >&2 exit 1 fi cache_model_locally "$1" "$2" From 0a485de74d254f2dab6445b4891f69630a816872 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Wed, 1 Apr 2026 07:30:13 +0000 Subject: [PATCH 08/41] Add MTP baseline to single-node MI325X DeepSeek-R1 FP8 config Add spec-decoding: mtp search space entries alongside the existing non-MTP entries for both 1k/1k and 8k/1k sequence length configs. This provides a single-node MTP baseline for comparison with the disaggregated multi-node MTP results. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 00b6a26de..1ab86b8af 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -89,10 +89,12 @@ dsr1-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x From 67dec7cfd088e7f8fb82afec88e1a3a21190c3b7 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Wed, 1 Apr 2026 07:36:09 +0000 Subject: [PATCH 09/41] Split MI325X single-node MTP into separate config key Separate dsr1-fp8-mi325x-sglang-mtp from the base config so it can be swept independently. Full sweeps still cover both via their respective config keys. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1ab86b8af..9fb5c53a0 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -89,11 +89,27 @@ dsr1-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.9-rocm700-mi30x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } dsr1-fp8-mi355x-sglang: From f18257f6abba974895693a6b0dce6363234d3bbe Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 2 Apr 2026 03:39:54 +0000 Subject: [PATCH 10/41] Fix MI325X single-node script resolution and add MTP support The launcher's script name pattern included _${FRAMEWORK} suffix, but single-node scripts don't use framework suffixes (only multi-node disagg scripts do). This broke all MI325X single-node configs. Fix by trying framework-suffixed name for multi-node first, then falling back to the base name for single-node. Also add MTP speculative decoding support to the existing dsr1_fp8_mi325x.sh script and update perf-changelog with the single-node config keys. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/dsr1_fp8_mi325x.sh | 13 +++++++++++-- perf-changelog.yaml | 10 +++++++--- runners/launch_mi325x-amd.sh | 15 ++++++++++----- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index ae1e930f0..54760882b 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -26,6 +26,14 @@ hf download $MODEL export SGLANG_USE_AITER=1 export SGLANG_AITER_MLA_PERSIST=1 +# MTP (speculative decoding) flags +MTP_ARGS="" +CHAT_TEMPLATE_ARGS="" +if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then + MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 1 --speculative-num-draft-tokens 2" + CHAT_TEMPLATE_ARGS="--use-chat-template" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -47,7 +55,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +$MTP_ARGS $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -64,7 +72,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + $CHAT_TEMPLATE_ARGS # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8e8ebc989..d059c439b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,10 +1,14 @@ - config-keys: + - dsr1-fp8-mi325x-sglang + - dsr1-fp8-mi325x-sglang-mtp - dsr1-fp8-mi325x-sglang-disagg - dsr1-fp8-mi325x-sglang-disagg-mtp description: - - "Add MI325X DeepSeek-R1 FP8 disaggregated inference with Broadcom Thor 2 IBGDA" - - "Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches" - - "Image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good" + - "Add MI325X DeepSeek-R1 FP8 single-node and disaggregated inference with Broadcom Thor 2 IBGDA" + - "Single-node: SGLang with aiter backend, MLA persist kernel, TP8, FP8 KV cache" + - "Disaggregated: Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches" + - "Image (disagg): ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good" + - "Image (single-node): lmsysorg/sglang:v0.5.9-rocm700-mi30x" - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k" - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 107c68d7d..6ac64f3d8 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -10,14 +10,19 @@ export LOCAL_MODEL_CACHE_DIR="${LOCAL_MODEL_CACHE_DIR:-/local-nvme/models}" PARTITION="compute" -# Detect benchmark subdir from where the script lives -SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh" -if [[ -f "benchmarks/multi_node/${SCRIPT_NAME}" ]]; 
then +# Detect benchmark subdir from where the script lives. +# Multi-node scripts include the framework suffix (e.g. _sglang-disagg.sh); +# single-node scripts do not (e.g. dsr1_fp8_mi325x.sh). +SCRIPT_NAME_WITH_FW="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh" +SCRIPT_NAME_BASE="${EXP_NAME%%_*}_${PRECISION}_mi325x.sh" +if [[ -f "benchmarks/multi_node/${SCRIPT_NAME_WITH_FW}" ]]; then BENCHMARK_SUBDIR="multi_node" -elif [[ -f "benchmarks/single_node/${SCRIPT_NAME}" ]]; then + SCRIPT_NAME="${SCRIPT_NAME_WITH_FW}" +elif [[ -f "benchmarks/single_node/${SCRIPT_NAME_BASE}" ]]; then BENCHMARK_SUBDIR="single_node" + SCRIPT_NAME="${SCRIPT_NAME_BASE}" else - echo "ERROR: ${SCRIPT_NAME} not found in benchmarks/multi_node or benchmarks/single_node" + echo "ERROR: neither benchmarks/multi_node/${SCRIPT_NAME_WITH_FW} nor benchmarks/single_node/${SCRIPT_NAME_BASE} found" exit 1 fi From 3ccfba3b1e0ea6146b2b760e30520df5d85332f8 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 2 Apr 2026 05:39:28 +0000 Subject: [PATCH 11/41] Fix decode dispatch token limit for DP attention disagg configs SGLang's DP attention mode overrides chunked_prefill_size to 1024, which must be <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. The default MORI_MAX_DISPATCH_TOKENS_DECODE of 160 is too small, causing an assertion failure on all EP8/DP decode configs (both MI325X and MI355X). Bump to 1024 when DP attention is enabled. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/multi_node/amd_utils/server.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index b477790b3..960cbb6e7 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -213,6 +213,13 @@ if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi +# DP attention forces chunked_prefill_size to 1024 inside SGLang, which must be +# <= SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK. Bump the decode dispatch +# token limit when DP is enabled to satisfy this assertion. +if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$MORI_MAX_DISPATCH_TOKENS_DECODE" -lt 1024 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=1024 +fi + # ============================================================================= # Cluster Topology Configuration # ============================================================================= From 021303211d473fa1ac25dbe870b14429459e1796 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 2 Apr 2026 18:15:38 +0000 Subject: [PATCH 12/41] Disable EP8/DP disagg configs on MI325X and bump MTP to 3 tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2 NICs) — servers start but all requests fail. Comment out these search-space entries for now. Bump DECODE_MTP_SIZE from 2 to 3 and speculative-num-steps from 1 to 3 for better low-concurrency decode throughput on CDNA3. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 232 +++++++++++----------- benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +- 2 files changed, 119 insertions(+), 115 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9fb5c53a0..68d2aec6e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1263,43 +1263,44 @@ dsr1-fp8-mi325x-sglang-disagg: - isl: 1024 osl: 1024 search-space: - # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) - - spec-decoding: "none" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 768, 512, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "none" + # conc-list: [ 768, 512, 256 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 
8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) - spec-decoding: "none" @@ -1342,24 +1343,25 @@ dsr1-fp8-mi325x-sglang-disagg: - isl: 8192 osl: 1024 search-space: - # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) - - spec-decoding: "none" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) - spec-decoding: "none" @@ -1414,43 +1416,44 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: osl: 1024 search-space: # MTP configurations - # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) - - spec-decoding: "mtp" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) - - spec-decoding: "mtp" - conc-list: [ 768, 512, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - 
"DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=1" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 768, 512, 256 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=1" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) - spec-decoding: "mtp" @@ -1469,7 +1472,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=3" # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "mtp" @@ -1488,30 +1491,31 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=3" - isl: 8192 osl: 1024 search-space: # MTP configurations - # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) - - spec-decoding: "mtp" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" + # DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode 
worker at DEP8) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1024 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=1" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) - spec-decoding: "mtp" @@ -1530,7 +1534,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=3" # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "mtp" @@ -1549,4 +1553,4 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=3" diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index 54760882b..dc594a854 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -30,7 +30,7 @@ export SGLANG_AITER_MLA_PERSIST=1 MTP_ARGS="" CHAT_TEMPLATE_ARGS="" if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then - MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 1 --speculative-num-draft-tokens 2" + MTP_ARGS="--speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-num-steps 3 --speculative-num-draft-tokens 4" CHAT_TEMPLATE_ARGS="--use-chat-template" fi From 2afb24a7c008e9f0b92d7509da43c194262d9014 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 2 Apr 2026 19:17:47 +0000 Subject: [PATCH 13/41] Add single-node EP8/DP test configs for MI325X disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds P(tp4) → D(tp8/ep8/dp, 1 node) search-space entries for both non-MTP and MTP disagg configs. 
This isolates whether EP/DP itself is broken on MI325X or if only the multi-node distributed init hangs with Broadcom Thor 2 NICs. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 68d2aec6e..adacf0203 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1340,6 +1340,26 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + # Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X + # or if only the multi-node distributed init is broken with Broadcom Thor 2) + - spec-decoding: "none" + conc-list: [ 512, 256, 128, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + - isl: 8192 osl: 1024 search-space: @@ -1493,6 +1513,25 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" + # Single-node EP8/DP decode with MTP (test: isolates EP/DP vs multi-node init) + - spec-decoding: "mtp" + conc-list: [ 512, 256, 128, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + - isl: 8192 osl: 1024 search-space: From 36aebfd083760cb06db29b61a10560e5a9004f81 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 3 Apr 2026 20:11:24 +0000 Subject: [PATCH 14/41] Move container image to semianalysiswork Docker Hub and fix launcher bugs - Retag disagg image from ghcr.io/jordannanos/sgl-mi325x-mori to semianalysiswork/sgl-cdna3-mori (unified name for MI300X/MI325X, both gfx942 CDNA3) - Add empty JOB_ID guard in multi-node launcher path - Fix 
grep to use -qx for exact job ID matching (prevents substring collisions with sibling SLURM jobs) Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 4 ++-- perf-changelog.yaml | 2 +- runners/launch_mi325x-amd.sh | 9 +++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index adacf0203..5c9d6ff41 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1251,7 +1251,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: dsr1-fp8-mi325x-sglang-disagg: - image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi325x-disagg @@ -1423,7 +1423,7 @@ dsr1-fp8-mi325x-sglang-disagg: dsr1-fp8-mi325x-sglang-disagg-mtp: - image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good + image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi325x-disagg diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d059c439b..e8f5c3bab 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -7,7 +7,7 @@ - "Add MI325X DeepSeek-R1 FP8 single-node and disaggregated inference with Broadcom Thor 2 IBGDA" - "Single-node: SGLang with aiter backend, MLA persist kernel, TP8, FP8 KV cache" - "Disaggregated: Custom container image built from akao-amd/sglang with MORI + bnxt_rocelib patches" - - "Image (disagg): ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good" + - "Image (disagg): semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt" - "Image (single-node): lmsysorg/sglang:v0.5.9-rocm700-mi30x" - "Full pareto sweep: non-MTP and MTP configs across 4 curve points, ISL 1k/1k and 8k/1k" - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker" diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 6ac64f3d8..89aa99934 100644 --- 
a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -89,12 +89,17 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + if [[ -z "$JOB_ID" ]]; then + echo "ERROR: benchmark script produced no job ID" + exit 1 + fi + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" sleep 10 while ! ls "$LOG_FILE" &>/dev/null; do - if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + if ! squeue -u "$USER" --noheader --format='%i' | grep -qx "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" scontrol show job "$JOB_ID" exit 1 @@ -105,7 +110,7 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then set +x ( - while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + while squeue -u $USER --noheader --format='%i' | grep -qx "$JOB_ID"; do sleep 10 done ) & From b5a0bc2a653ac4a848cbd8a23c8c69b5417c05bd Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Sat, 4 Apr 2026 05:05:18 +0000 Subject: [PATCH 15/41] Test EP8/DP workaround: drop MoRI a2a backend on MI325X bnxt_re Add DeepSeek-R1-0528-bnxt model config in models.yaml that removes --moe-a2a-backend mori from dp_flags while keeping MoRI for KV cache transfer. The EP8/DP test configs now use MODEL_YAML_KEY override to select this config, testing whether EP/DP works with default a2a kernels on Broadcom Thor 2 NICs. 
See sgl-project/sglang#22072 Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 10 ++++-- benchmarks/multi_node/amd_utils/models.yaml | 34 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5c9d6ff41..1d7e304a8 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1340,8 +1340,8 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - # Single-node EP8/DP decode (test: isolates whether EP/DP itself works on MI325X - # or if only the multi-node distributed init is broken with Broadcom Thor 2) + # Single-node EP8/DP decode — workaround: use default a2a kernels instead of + # MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072 - spec-decoding: "none" conc-list: [ 512, 256, 128, 64 ] prefill: @@ -1351,6 +1351,7 @@ dsr1-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" decode: num-worker: 1 tp: 8 @@ -1359,6 +1360,7 @@ dsr1-fp8-mi325x-sglang-disagg: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - isl: 8192 osl: 1024 @@ -1513,7 +1515,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # Single-node EP8/DP decode with MTP (test: isolates EP/DP vs multi-node init) + # Single-node EP8/DP decode with MTP — workaround: default a2a kernels - spec-decoding: "mtp" conc-list: [ 512, 256, 128, 64 ] prefill: @@ -1523,6 +1525,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" decode: num-worker: 1 tp: 8 @@ -1531,6 +1534,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" + - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - isl: 8192 osl: 1024 diff --git 
a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 2bbdd91d6..60817e12e 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -130,6 +130,40 @@ DeepSeek-R1: chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" +# Workaround for MI325X Broadcom Thor 2 (bnxt_re): drop --moe-a2a-backend mori +# from dp_flags to test if EP/DP works with default a2a kernels while keeping +# MoRI for KV cache transfer. See sgl-project/sglang#22072 +DeepSeek-R1-0528-bnxt: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + DeepSeek-R1-0528: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN 
--speculative-eagle-topk 1" From beb3808bef9f39010b499ec173ca1d0149198655 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Sat, 4 Apr 2026 06:12:35 +0000 Subject: [PATCH 16/41] Fix MODEL_NAME for EP8/DP test configs with MODEL_YAML_KEY override MODEL_NAME falls back to MODEL_YAML_KEY when HF cache lookup fails (pre-existing tr bug with double-dash in org--repo). Explicitly set MODEL_NAME=DeepSeek-R1-0528 in additional-settings so the model path resolves correctly on disk. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1d7e304a8..7b28b5447 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1352,6 +1352,7 @@ dsr1-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + - "MODEL_NAME=DeepSeek-R1-0528" decode: num-worker: 1 tp: 8 @@ -1361,6 +1362,7 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + - "MODEL_NAME=DeepSeek-R1-0528" - isl: 8192 osl: 1024 @@ -1526,6 +1528,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + - "MODEL_NAME=DeepSeek-R1-0528" decode: num-worker: 1 tp: 8 @@ -1535,6 +1538,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + - "MODEL_NAME=DeepSeek-R1-0528" - isl: 8192 osl: 1024 From 23c293154c56009756cde8eadc9a99e92f23a07b Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Sat, 4 Apr 2026 07:13:17 +0000 Subject: [PATCH 17/41] fix: resolve MODEL_NAME from flat repo dir when HF snapshot absent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When MODEL_YAML_KEY differs from the actual model directory name (e.g. 
DeepSeek-R1-0528-bnxt key → DeepSeek-R1-0528 dir), the old fallback to MODEL_YAML_KEY produced a non-existent path, causing the EP8/DP workaround jobs to fail the model-availability check before even starting. The MI325X cluster stores models as flat directories named after the HF repo (e.g. hf-hub-cache/DeepSeek-R1-0528), not in HF hub cache snapshot layout. Add an intermediate check for that flat-dir format before falling back to MODEL_YAML_KEY. Co-Authored-By: Claude Sonnet 4.6 --- runners/launch_mi325x-amd.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 89aa99934..41b354705 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -75,6 +75,11 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1) if [[ -n "${_SNAPSHOT}" ]]; then export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}" + elif [[ -d "${MODEL_PATH}/${MODEL##*/}" ]]; then + # Cluster stores models as flat dirs named after the repo (e.g. DeepSeek-R1-0528), + # not in HF hub cache layout. Use repo name so MODEL_YAML_KEY can differ from + # the path (e.g. DeepSeek-R1-0528-bnxt yaml key → DeepSeek-R1-0528 dir). + export MODEL_NAME="${MODEL##*/}" else export MODEL_NAME="${MODEL_YAML_KEY}" fi From e5b9d00b8d97670f3b40408aca0ac5f9e5484a1c Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Sat, 4 Apr 2026 15:36:28 +0000 Subject: [PATCH 18/41] Tune EP8/DP test: lower concurrency + QP params for SQ full fix Previous EP8/DP test hit RDMA SQ overflow (max=4351) at high concurrency. Reduce conc to 4-64, set MORI_IO_QP_MAX_SEND_WR=4096 to stay within Broadcom bnxt_re hardware limits, and lower MORI_MAX_DISPATCH_TOKENS_DECODE. Disable MTP EP8/DP test (HSA GPU fault) to focus on non-MTP first. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 54 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7b28b5447..8f33d1116 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1340,10 +1340,10 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - # Single-node EP8/DP decode — workaround: use default a2a kernels instead of - # MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072 + # Single-node EP8/DP decode — workaround: no MoRI a2a, reduced concurrency + # to avoid RDMA SQ overflow on Broadcom bnxt_re. See sgl-project/sglang#22072 - spec-decoding: "none" - conc-list: [ 512, 256, 128, 64 ] + conc-list: [ 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 4 @@ -1363,6 +1363,9 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - "MODEL_NAME=DeepSeek-R1-0528" + - "MORI_MAX_DISPATCH_TOKENS_DECODE=160" + - "MORI_IO_QP_MAX_SEND_WR=4096" + - "MORI_IO_QP_MAX_CQE=8192" - isl: 8192 osl: 1024 @@ -1517,28 +1520,29 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # Single-node EP8/DP decode with MTP — workaround: default a2a kernels - - spec-decoding: "mtp" - conc-list: [ 512, 256, 128, 64 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - - "MODEL_NAME=DeepSeek-R1-0528" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - - "MODEL_NAME=DeepSeek-R1-0528" + # DISABLED: MTP + EP8/DP hits HSA hardware exception (GPU kernel fault) on CDNA3. + # Focus on getting non-MTP EP8/DP working first. 
+ # - spec-decoding: "mtp" + # conc-list: [ 64, 32, 16, 8, 4 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + # - "MODEL_NAME=DeepSeek-R1-0528" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=3" + # - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + # - "MODEL_NAME=DeepSeek-R1-0528" - isl: 8192 osl: 1024 From 76d89d0f7b39d118d41be66d8b33fb10091d14e0 Mon Sep 17 00:00:00 2001 From: Jordan Nanos Date: Sun, 5 Apr 2026 02:12:34 +0000 Subject: [PATCH 19/41] fix: lower bnxt_re QP limits and concurrency for MI325X EP8/DP disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Broadcom Thor 2 (bnxt_re) NICs cap SQ depth at ~4351 entries. The upstream MORI defaults (SEND_WR=16384, CQE=32768) cause SQ overflow under EP8 RDMA traffic. Lower to SEND_WR=4096 / CQE=8192 per sgl-project/sglang#22072. Also cap EP8/DP workaround concurrency at 64 (from 512) — bnxt_re saturates the SQ at higher concurrency with EP8's 8x RDMA traffic multiplier. Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 54 +++++++++++++------------- benchmarks/multi_node/amd_utils/env.sh | 7 +++- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8f33d1116..c6aa404a0 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1340,8 +1340,9 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - # Single-node EP8/DP decode — workaround: no MoRI a2a, reduced concurrency - # to avoid RDMA SQ overflow on Broadcom bnxt_re. See sgl-project/sglang#22072 + # Single-node EP8/DP decode — workaround: use default a2a kernels instead of + # MoRI a2a (which hangs on Broadcom bnxt_re). 
See sgl-project/sglang#22072 + # Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8. - spec-decoding: "none" conc-list: [ 64, 32, 16, 8, 4 ] prefill: @@ -1363,9 +1364,6 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - "MODEL_NAME=DeepSeek-R1-0528" - - "MORI_MAX_DISPATCH_TOKENS_DECODE=160" - - "MORI_IO_QP_MAX_SEND_WR=4096" - - "MORI_IO_QP_MAX_CQE=8192" - isl: 8192 osl: 1024 @@ -1520,29 +1518,29 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # DISABLED: MTP + EP8/DP hits HSA hardware exception (GPU kernel fault) on CDNA3. - # Focus on getting non-MTP EP8/DP working first. - # - spec-decoding: "mtp" - # conc-list: [ 64, 32, 16, 8, 4 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - # - "MODEL_NAME=DeepSeek-R1-0528" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=1" - # - "DECODE_MTP_SIZE=3" - # - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - # - "MODEL_NAME=DeepSeek-R1-0528" + # Single-node EP8/DP decode with MTP — workaround: default a2a kernels + # Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8. 
+ - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + - "MODEL_NAME=DeepSeek-R1-0528" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" + - "MODEL_NAME=DeepSeek-R1-0528" - isl: 8192 osl: 1024 diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 99f2d0238..9e2942ecf 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -67,8 +67,11 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export MORI_EP_LAUNCH_CONFIG_MODE=AUTO -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 +# Broadcom bnxt_re NICs cap SQ depth at ~4351 entries. Lower from upstream +# defaults (16384/32768) to avoid SQ overflow under EP8 RDMA traffic. +# See sgl-project/sglang#22072 +export MORI_IO_QP_MAX_SEND_WR=4096 +export MORI_IO_QP_MAX_CQE=8192 export MORI_IO_QP_MAX_SGE=4 export MORI_APP_LOG_LEVEL=INFO From 4d9ee30dbfb0e1bab10505cace365397f0305844 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 9 Apr 2026 17:31:57 +0000 Subject: [PATCH 20/41] Add GLM-5 FP8 single-node benchmark for MI325X SGLang with NSA tilelang backends, TP8, conc 4-64. Based on MI355X GLM-5 script with MI325X aiter/MLA persist tuning. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 18 +++++ benchmarks/single_node/glm5_fp8_mi325x.sh | 80 +++++++++++++++++++++++ perf-changelog.yaml | 8 +++ 3 files changed, 106 insertions(+) create mode 100755 benchmarks/single_node/glm5_fp8_mi325x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c6aa404a0..2b5479564 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -112,6 +112,24 @@ dsr1-fp8-mi325x-sglang-mtp: search-space: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } +glm5-fp8-mi325x-sglang: + image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x model: deepseek-ai/DeepSeek-R1-0528 diff --git a/benchmarks/single_node/glm5_fp8_mi325x.sh b/benchmarks/single_node/glm5_fp8_mi325x.sh new file mode 100755 index 000000000..1a35d7fc8 --- /dev/null +++ b/benchmarks/single_node/glm5_fp8_mi325x.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +# GLM-5 requires transformers with glm_moe_dsa model type support. 
+python3 -m pip install -U --no-cache-dir \ + "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" + +hf download "$MODEL" + +# ROCm / SGLang performance tuning for MI325X +export SGLANG_USE_AITER=1 +export SGLANG_AITER_MLA_PERSIST=1 +export SGLANG_ROCM_FUSED_DECODE_MLA=0 +export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export SAFETENSORS_FAST_GPU=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +python3 -m sglang.launch_server \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --trust-remote-code \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --mem-fraction-static 0.85 \ + --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ + --nsa-prefill-backend tilelang \ + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e8f5c3bab..9eab938a9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -13,6 +13,14 @@ - "Dockerfile patches: https://github.com/JordanNanos/sglang/tree/main/docker" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 +- config-keys: + - glm5-fp8-mi325x-sglang + description: + - "Add GLM-5 FP8 single-node MI325X SGLang benchmark (TP8)" + - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" + - "Image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 + - config-keys: - kimik2.5-int4-mi300x-vllm description: From 13c11670d4d52a2b1edfd0bdca7e71536152372c Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 9 Apr 2026 20:18:30 +0000 Subject: [PATCH 21/41] Skip HF download validation when model is cached on MI325X Avoids stale NFS file lock errors when multiple runners hit the shared HF cache simultaneously. Sets HF_HUB_OFFLINE=1 if the model directory already exists. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/single_node/glm5_fp8_mi325x.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/glm5_fp8_mi325x.sh b/benchmarks/single_node/glm5_fp8_mi325x.sh index 1a35d7fc8..5ff4b9e5a 100755 --- a/benchmarks/single_node/glm5_fp8_mi325x.sh +++ b/benchmarks/single_node/glm5_fp8_mi325x.sh @@ -19,7 +19,13 @@ fi python3 -m pip install -U --no-cache-dir \ "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" -hf download "$MODEL" +# Skip HF hub validation if model is already cached (avoids stale NFS lock issues) +if [[ -d "${HF_HUB_CACHE}/models--$(echo "$MODEL" | tr '/' '--')" ]]; then + echo "Model already cached, skipping download" + export HF_HUB_OFFLINE=1 +else + hf download "$MODEL" +fi # ROCm / SGLang performance tuning for MI325X export SGLANG_USE_AITER=1 From d4d6e1986307b15a4b3c57f79562ab489da67b02 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 9 Apr 2026 22:50:35 +0000 Subject: [PATCH 22/41] Add Qwen3.5 and GLM-5 FP8 disaggregated inference for MI325X Port MI355X disagg configs (from chun-chang/sglang-disagg-qwen3.5) to MI325X with Broadcom Thor 2 adaptations: - TP-only configs (1P2D TP8, 1P1D TP4/TP8) enabled - EP8/DP multi-node configs commented out (MoRI a2a hangs on bnxt_re) - Single-node EP8/DP workaround included (no MoRI a2a, low conc) - Model entries in models.yaml + bnxt variants for EP/DP workaround - GLM-5 transformers patch and env tuning in server.sh/env.sh Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 354 ++++++++++++++++++ benchmarks/multi_node/amd_utils/env.sh | 7 + benchmarks/multi_node/amd_utils/models.yaml | 125 +++++++ benchmarks/multi_node/amd_utils/server.sh | 7 + .../glm5_fp8_mi325x_sglang-disagg.sh | 82 ++++ .../qwen3.5_fp8_mi325x_sglang-disagg.sh | 82 ++++ perf-changelog.yaml | 7 +- 7 files changed, 662 insertions(+), 2 deletions(-) create mode 100755 
benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh create mode 100755 benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2b5479564..25bcf30ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1621,3 +1621,357 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" + +qwen3.5-fp8-mi325x-sglang-disagg: + image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # DISABLED: EP8/DP configs — MoRI a2a hangs on Broadcom bnxt_re (sgl-project/sglang#22072) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP16) + # - spec-decoding: "none" + # conc-list: [ 1024, 2048 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "none" + # conc-list: [ 1536, 1024, 512 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - 
"DECODE_MTP_SIZE=0" + + # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # Single-node EP8/DP workaround: no MoRI a2a, low conc + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" + - "MORI_MAX_DISPATCH_TOKENS_DECODE=160" + - "MORI_IO_QP_MAX_SEND_WR=4096" + - "MORI_IO_QP_MAX_CQE=8192" + + - isl: 8192 + osl: 1024 + search-space: + # DISABLED: EP8/DP (sgl-project/sglang#22072) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 1024, 2048 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 
16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +glm5-fp8-mi325x-sglang-disagg: + image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi325x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # DISABLED: EP8/DP configs — MoRI a2a hangs on Broadcom bnxt_re (sgl-project/sglang#22072) + # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP16) + # - spec-decoding: "none" + # conc-list: [ 1024, 2048 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) + # - spec-decoding: "none" + # conc-list: [ 1536, 1024, 512 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=2" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: 
false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # Single-node EP8/DP workaround: no MoRI a2a, low conc + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" + - "MODEL_NAME=GLM-5-FP8" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" + - "MODEL_NAME=GLM-5-FP8" + - "MORI_MAX_DISPATCH_TOKENS_DECODE=160" + - "MORI_IO_QP_MAX_SEND_WR=4096" + - "MORI_IO_QP_MAX_CQE=8192" + + - isl: 8192 + osl: 1024 + search-space: + # DISABLED: EP8/DP (sgl-project/sglang#22072) + # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) + # - spec-decoding: "none" + # conc-list: [ 1024, 2048 ] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "PREFILL_NODES=2" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_NODES=1" + # - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" diff --git 
a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 9e2942ecf..1d0e376e5 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -45,6 +45,13 @@ export SGLANG_USE_AITER=1 export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 +# GLM-5: uses NSA (not MLA), needs fused-decode-MLA disabled + fast loading +if [[ "$MODEL_NAME" == *GLM-5* ]]; then + export SGLANG_ROCM_FUSED_DECODE_MLA=0 + export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + export SAFETENSORS_FAST_GPU=1 +fi + # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION export SGLANG_MORI_FP8_DISP=True diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 60817e12e..e2d5e78eb 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -256,3 +256,128 @@ DeepSeek-R1-0528-MXFP4: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" +Qwen3.5-397B-A17B-FP8: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "" + dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 
262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + +# MI325X bnxt workaround: no MoRI a2a for EP/DP (sgl-project/sglang#22072) +Qwen3.5-397B-A17B-FP8-bnxt: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + +GLM-5-FP8: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\"enable_multithread_load\": true, \"num_threads\": 8}'" + mtp_flags: "" + dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + 
decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + +# MI325X bnxt workaround: no MoRI a2a for EP/DP (sgl-project/sglang#22072) +GLM-5-FP8-bnxt: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\"enable_multithread_load\": true, \"num_threads\": 8}'" + mtp_flags: "" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 960cbb6e7..1384d6c9d 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -11,6 +11,13 @@ NODE_RANK="${NODE_RANK:-0}" MODEL_DIR="${MODEL_DIR:-}" MODEL_NAME="${MODEL_NAME:-}" +# GLM-5 requires transformers with glm_moe_dsa model 
type support +if [[ "$MODEL_NAME" == *GLM-5* ]]; then + echo "[setup] Installing transformers for GLM-5 model type support..." + pip install -U --no-cache-dir \ + "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" 2>&1 | tail -3 +fi + xP="${xP:-1}" #-> Number of Prefill Workers yD="${yD:-1}" #-> Number of Decode Workers diff --git a/benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh new file mode 100755 index 000000000..6a7314ab4 --- /dev/null +++ b/benchmarks/multi_node/glm5_fp8_mi325x_sglang-disagg.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented 
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh b/benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh new file mode 100755 index 000000000..6a7314ab4 --- /dev/null +++ b/benchmarks/multi_node/qwen3.5_fp8_mi325x_sglang-disagg.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else 
+export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9eab938a9..d63ffae43 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -15,10 +15,13 @@ - config-keys: - glm5-fp8-mi325x-sglang + - glm5-fp8-mi325x-sglang-disagg + - qwen3.5-fp8-mi325x-sglang-disagg description: - - "Add GLM-5 FP8 single-node MI325X SGLang benchmark (TP8)" - - "Model: zai-org/GLM-5-FP8 with NSA tilelang backends" + - "Add GLM-5 and Qwen3.5 FP8 MI325X benchmarks (single-node + disaggregated)" + - "Disagg: TP-only configs + single-node EP8/DP workaround (no MoRI a2a)" - "Image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt" + - "Ported from MI355X disagg configs (chun-chang/sglang-disagg-qwen3.5)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/985 - config-keys: From 5228c620885cc1e7119d99140bdf42980395fd5f Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Thu, 9 Apr 2026 23:01:33 +0000 Subject: [PATCH 23/41] Fix HF cache path resolution: use sed instead of tr for org/repo separator tr '/' '--' converts org/repo to org-repo (single dash), but HF hub cache uses org--repo (double dash). Use sed 's|/|--|g' to produce the correct models--org--repo directory name. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- runners/launch_mi325x-amd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh index 41b354705..367ffad37 100644 --- a/runners/launch_mi325x-amd.sh +++ b/runners/launch_mi325x-amd.sh @@ -71,7 +71,7 @@ if [[ "$BENCHMARK_SUBDIR" == "multi_node" ]]; then # MODEL_NAME: relative path under MODEL_PATH for --model-path inside the container. # Auto-resolved from HF hub cache layout so no symlink is needed. if [[ -z "${MODEL_NAME:-}" ]]; then - _HF_DIR="models--$(echo "${MODEL}" | tr '/' '--')" + _HF_DIR="models--$(echo "${MODEL}" | sed 's|/|--|g')" _SNAPSHOT=$(ls "${MODEL_PATH}/${_HF_DIR}/snapshots/" 2>/dev/null | sort | tail -1) if [[ -n "${_SNAPSHOT}" ]]; then export MODEL_NAME="${_HF_DIR}/snapshots/${_SNAPSHOT}" From b08abaf8d6a9d4b5320e9610c8fe0f28702f5087 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 05:40:48 +0000 Subject: [PATCH 24/41] Sanitize MODEL_NAME in Docker container name HF cache paths (models--org--repo/snapshots/hash) contain slashes which are invalid in Docker container names. Sanitize MODEL_NAME the same way as USER_NAME using tr. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/multi_node/amd_utils/job.slurm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 523bfd7c5..9634c7f1f 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -289,7 +289,8 @@ export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +SANITIZED_MODEL=$(echo "$MODEL_NAME" | tr -c 'a-zA-Z0-9_.-' '_') +export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${SANITIZED_MODEL}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" From 6dbaa19723c3120c5caf0f93ba45cbc7d85b69bb Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 05:49:11 +0000 Subject: [PATCH 25/41] Force-reinstall transformers for GLM-5 in disagg Docker containers Add --force-reinstall and diagnostic output to verify glm_moe_dsa model type is available after install. Also match on MODEL_YAML_KEY in case MODEL_NAME doesn't contain GLM-5. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/multi_node/amd_utils/server.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 1384d6c9d..91571529b 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -12,10 +12,11 @@ MODEL_DIR="${MODEL_DIR:-}" MODEL_NAME="${MODEL_NAME:-}" # GLM-5 requires transformers with glm_moe_dsa model type support -if [[ "$MODEL_NAME" == *GLM-5* ]]; then +if [[ "$MODEL_NAME" == *GLM-5* ]] || [[ "$MODEL_YAML_KEY" == *GLM-5* ]]; then echo "[setup] Installing transformers for GLM-5 model type support..." 
- pip install -U --no-cache-dir \ - "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" 2>&1 | tail -3 + pip install -U --no-cache-dir --force-reinstall \ + "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" 2>&1 | tail -5 + python3 -c "import transformers; print(f'[setup] transformers {transformers.__version__}, glm_moe_dsa supported: {hasattr(transformers, \"GlmMoeDsaConfig\")}')" 2>&1 fi xP="${xP:-1}" #-> Number of Prefill Workers From 2c24d0d172f468c309d4ec87f82c88fe39be8787 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 06:31:53 +0000 Subject: [PATCH 26/41] Switch GLM-5 MI325X configs to v0.5.10 image The v0.5.9 image doesn't recognize glm_moe_dsa model type even with runtime transformers pip install (SGLang caches the old config registry). Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt which has newer transformers built in. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 25bcf30ca..67e0a04ac 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi325x-sglang: - image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x @@ -1800,7 +1800,7 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" glm5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x-disagg From d3522ec7d657e963512eb1aa72674a26751b9348 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 07:12:02 +0000 Subject: 
[PATCH 27/41] Switch GLM-5 MI325X to MI355X GLM-5 image (rocm/sgl-dev mori-0402) The semianalysiswork images don't have glm_moe_dsa in SGLang's config mapping. Use the MI355X GLM-5 disagg image which was specifically built with GLM-5 support. Both MI355X and MI325X are gfx942 CDNA3. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 67e0a04ac..56250f5e8 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi325x-sglang: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x @@ -1800,7 +1800,7 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" glm5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x-disagg From 5dd235fd95fde84d9a508820e6e35df56a7961f7 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 15:40:12 +0000 Subject: [PATCH 28/41] Switch Qwen3.5/GLM-5 disagg to v0.5.10 image + no-MoRI transfer Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt image and drop --disaggregation-transfer-backend mori for Qwen3.5/GLM-5 disagg. The v0.5.9 MoRI image's router never became ready for non-DeepSeek models. Using no-MoRI model variants (Qwen3.5-397B-A17B-FP8-no-mori, GLM-5-FP8-no-mori) to bypass MoRI entirely on bnxt_re. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 70 ++++++++++++++++++++- benchmarks/multi_node/amd_utils/models.yaml | 64 +++++++++++++++++++ 2 files changed, 131 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 56250f5e8..acb2e7db3 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi325x-sglang: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402 + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x @@ -1623,7 +1623,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=3" qwen3.5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi325x-disagg @@ -1646,6 +1646,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # decode: # num-worker: 1 # tp: 8 @@ -1653,6 +1655,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # - "DECODE_MTP_SIZE=0" # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) @@ -1665,6 +1669,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # decode: # num-worker: 2 # tp: 8 @@ -1672,6 +1678,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - 
"MODEL_NAME=Qwen3.5-397B-A17B-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1684,6 +1692,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 2 tp: 8 @@ -1691,6 +1701,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) @@ -1703,6 +1715,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 1 tp: 8 @@ -1710,6 +1724,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" # Single-node EP8/DP workaround: no MoRI a2a, low conc @@ -1722,6 +1738,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: @@ -1731,6 +1749,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: true additional-settings: - "DECODE_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" @@ -1752,6 +1772,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "PREFILL_NODES=2" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # decode: # num-worker: 1 # tp: 8 @@ 
-1759,6 +1781,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1771,6 +1795,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 2 tp: 8 @@ -1778,6 +1804,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) @@ -1790,6 +1818,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 1 tp: 8 @@ -1797,10 +1827,12 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" glm5-fp8-mi325x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0402 + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x-disagg @@ -1823,6 +1855,8 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # decode: # num-worker: 1 # tp: 8 @@ -1830,6 +1864,8 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # - "DECODE_MTP_SIZE=0" # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) @@ -1842,6 +1878,8 @@ 
glm5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # decode: # num-worker: 2 # tp: 8 @@ -1849,6 +1887,8 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1861,6 +1901,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 2 tp: 8 @@ -1868,6 +1910,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) @@ -1880,6 +1924,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 1 tp: 8 @@ -1887,6 +1933,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" # Single-node EP8/DP workaround: no MoRI a2a, low conc @@ -1899,6 +1947,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" - "MODEL_NAME=GLM-5-FP8" decode: @@ -1908,6 +1958,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: true additional-settings: - "DECODE_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" - "MODEL_NAME=GLM-5-FP8" @@ -1929,6 +1981,8 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "PREFILL_NODES=2" + - 
"MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # decode: # num-worker: 1 # tp: 8 @@ -1936,6 +1990,8 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1948,6 +2004,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 2 tp: 8 @@ -1955,6 +2013,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) @@ -1967,6 +2027,8 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 1 tp: 8 @@ -1974,4 +2036,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index e2d5e78eb..8b6686338 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -381,3 +381,67 @@ GLM-5-FP8-bnxt: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +# MI325X no-MoRI variants: drop disaggregation-transfer-backend mori entirely +# Use if router never becomes ready with MoRI KV transfer on bnxt_re +Qwen3.5-397B-A17B-FP8-no-mori: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter" + mtp_flags: "" + dp_flags: "--enable-dp-attention 
--moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: 262144 + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + +GLM-5-FP8-no-mori: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\"enable_multithread_load\": true, \"num_threads\": 8}'" + mtp_flags: "" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: 262144 + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" From d8abc66ee15f4bfeb96bc3a5aa6ae6cd119850b3 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 15:40:12 +0000 Subject: [PATCH 29/41] Switch Qwen3.5/GLM-5 disagg to v0.5.10 image + no-MoRI transfer Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt image and drop --disaggregation-transfer-backend mori for 
Qwen3.5/GLM-5 disagg. The v0.5.9 MoRI image's router never became ready for non-DeepSeek models. Using no-MoRI model variants to bypass MoRI entirely. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index acb2e7db3..7c13f91f2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1704,6 +1704,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -1727,6 +1729,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # Single-node EP8/DP workaround: no MoRI a2a, low conc - spec-decoding: "none" @@ -1807,6 +1811,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -1830,6 +1836,8 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" + - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" glm5-fp8-mi325x-sglang-disagg: image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt @@ -1913,6 +1921,8 @@ glm5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" + - 
"MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -1936,6 +1946,8 @@ glm5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # Single-node EP8/DP workaround: no MoRI a2a, low conc - spec-decoding: "none" @@ -2016,6 +2028,8 @@ glm5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -2039,3 +2053,5 @@ glm5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" + - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" + - "MODEL_NAME=GLM-5-FP8" From 44780e06d2901cd9a7a43ee19d8735b4205994be Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 16:06:56 +0000 Subject: [PATCH 30/41] Fix YAML: switch Qwen3.5/GLM-5 disagg to v0.5.10 + no-MoRI transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt and drop MoRI transfer backend for Qwen3.5/GLM-5 disagg on MI325X. Previous sed edit broke YAML indentation — this commit fixes it with validated Python-based editing. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 48 --------------------------------- 1 file changed, 48 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7c13f91f2..3dfa3ef5c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1646,8 +1646,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # decode: # num-worker: 1 # tp: 8 @@ -1655,8 +1653,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # - "DECODE_MTP_SIZE=0" # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) @@ -1669,8 +1665,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # decode: # num-worker: 2 # tp: 8 @@ -1678,8 +1672,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1701,8 +1693,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" @@ -1726,8 +1716,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" - 
"MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" @@ -1742,8 +1730,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: @@ -1753,8 +1739,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: true additional-settings: - "DECODE_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" @@ -1776,8 +1760,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "PREFILL_NODES=2" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # decode: # num-worker: 1 # tp: 8 @@ -1785,8 +1767,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1808,8 +1788,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" @@ -1833,8 +1811,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" @@ -1863,8 +1839,6 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - 
"MODEL_NAME=GLM-5-FP8" # decode: # num-worker: 1 # tp: 8 @@ -1872,8 +1846,6 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # - "DECODE_MTP_SIZE=0" # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) @@ -1886,8 +1858,6 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: false # additional-settings: # - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # decode: # num-worker: 2 # tp: 8 @@ -1895,8 +1865,6 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=2" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -1918,8 +1886,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" @@ -1943,8 +1909,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" @@ -1959,8 +1923,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" - "MODEL_NAME=GLM-5-FP8" decode: @@ -1970,8 +1932,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: true additional-settings: - "DECODE_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" - "MODEL_NAME=GLM-5-FP8" @@ -1993,8 +1953,6 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "PREFILL_NODES=2" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - 
"MODEL_NAME=GLM-5-FP8" # decode: # num-worker: 1 # tp: 8 @@ -2002,8 +1960,6 @@ glm5-fp8-mi325x-sglang-disagg: # dp-attn: true # additional-settings: # - "DECODE_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) @@ -2025,8 +1981,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" @@ -2050,8 +2004,6 @@ glm5-fp8-mi325x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - "MODEL_NAME=GLM-5-FP8" From 21ce11aaf7b454ad5a64f23c1f6f95a30ac2263f Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 16:11:25 +0000 Subject: [PATCH 31/41] =?UTF-8?q?Remove=20MODEL=5FNAME=20overrides=20?= =?UTF-8?q?=E2=80=94=20let=20launcher=20resolve=20HF=20cache=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MODEL_NAME=GLM-5-FP8 doesn't exist on disk; the actual path is models--zai-org--GLM-5-FP8/snapshots/. The launcher's sed fix already resolves this correctly when MODEL_NAME is unset. Only MODEL_YAML_KEY override is needed for the no-MoRI model config. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3dfa3ef5c..8c00326e7 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1371,7 +1371,6 @@ dsr1-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - - "MODEL_NAME=DeepSeek-R1-0528" decode: num-worker: 1 tp: 8 @@ -1381,7 +1380,6 @@ dsr1-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - - "MODEL_NAME=DeepSeek-R1-0528" - isl: 8192 osl: 1024 @@ -1548,7 +1546,6 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - - "MODEL_NAME=DeepSeek-R1-0528" decode: num-worker: 1 tp: 8 @@ -1558,7 +1555,6 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt" - - "MODEL_NAME=DeepSeek-R1-0528" - isl: 8192 osl: 1024 @@ -1685,7 +1681,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 2 tp: 8 @@ -1695,7 +1690,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -1708,7 +1702,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 1 tp: 8 @@ -1718,7 +1711,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - 
"MODEL_NAME=Qwen3.5-397B-A17B-FP8" # Single-node EP8/DP workaround: no MoRI a2a, low conc - spec-decoding: "none" @@ -1731,7 +1723,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 1 tp: 8 @@ -1741,7 +1732,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-bnxt" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" - "MORI_MAX_DISPATCH_TOKENS_DECODE=160" - "MORI_IO_QP_MAX_SEND_WR=4096" - "MORI_IO_QP_MAX_CQE=8192" @@ -1780,7 +1770,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 2 tp: 8 @@ -1790,7 +1779,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -1803,7 +1791,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" decode: num-worker: 1 tp: 8 @@ -1813,7 +1800,6 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - - "MODEL_NAME=Qwen3.5-397B-A17B-FP8" glm5-fp8-mi325x-sglang-disagg: image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt @@ -1878,7 +1864,6 @@ glm5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 2 tp: 8 @@ -1888,7 +1873,6 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - 
spec-decoding: "none" @@ -1901,7 +1885,6 @@ glm5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 1 tp: 8 @@ -1911,7 +1894,6 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # Single-node EP8/DP workaround: no MoRI a2a, low conc - spec-decoding: "none" @@ -1924,7 +1906,6 @@ glm5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" - - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 1 tp: 8 @@ -1934,7 +1915,6 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-bnxt" - - "MODEL_NAME=GLM-5-FP8" - "MORI_MAX_DISPATCH_TOKENS_DECODE=160" - "MORI_IO_QP_MAX_SEND_WR=4096" - "MORI_IO_QP_MAX_CQE=8192" @@ -1973,7 +1953,6 @@ glm5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 2 tp: 8 @@ -1983,7 +1962,6 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) - spec-decoding: "none" @@ -1996,7 +1974,6 @@ glm5-fp8-mi325x-sglang-disagg: additional-settings: - "PREFILL_NODES=1" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" decode: num-worker: 1 tp: 8 @@ -2006,4 +1983,3 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - - "MODEL_NAME=GLM-5-FP8" From fc2f0d9dabaea6e440c806967379b689ab3e7604 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 17:59:15 +0000 Subject: [PATCH 32/41] Fix TP mismatch for non-MLA models in Qwen3.5/GLM-5 disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-MLA models (Qwen3.5, GLM-5) require 
matched TP sizes between prefill and decode for PD disaggregation. Changed P(tp4)→D(tp8) to P(tp8)→D(tp8) for "Small scale" configs. DeepSeek-R1 (MLA) can use mismatched TP because of compressed latent representations. See sgl-project/sglang#15674 Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8c00326e7..52d7c533c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1691,12 +1691,12 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + # "Small scale" (1P1D both TP8 — non-MLA models need matched TP) - spec-decoding: "none" conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 dp-attn: false additional-settings: @@ -1780,12 +1780,12 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" - # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + # "Small scale" (1P1D both TP8 — non-MLA models need matched TP) - spec-decoding: "none" conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 dp-attn: false additional-settings: @@ -1874,12 +1874,12 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + # "Small scale" (1P1D both TP8 — non-MLA models need matched TP) - spec-decoding: "none" conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 dp-attn: false additional-settings: @@ -1963,12 +1963,12 @@ glm5-fp8-mi325x-sglang-disagg: - "DECODE_MTP_SIZE=0" - "MODEL_YAML_KEY=GLM-5-FP8-no-mori" - # "Small scale" (1 prefill worker at TP4, 1 decode worker at TP8) + # "Small scale" (1P1D 
both TP8 — non-MLA models need matched TP) - spec-decoding: "none" conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 dp-attn: false additional-settings: From c956ce21d78bf843b5e678143d25c2a11bc5e5d2 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 18:26:03 +0000 Subject: [PATCH 33/41] Add MI325X container image build scripts and documentation Build script for semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori: - SGLang v0.5.10 (supports Qwen3.5, GLM-5, DeepSeek-R1) - Broadcom Thor 2 bnxt_rocelib for IBGDA/RoCEv2 - MoRI disaggregated inference - Based on JordanNanos/sglang fork's rocm.Dockerfile Includes sbatch script for building on the MI325X cluster and documentation of build args, prerequisites, and known issues. Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/README-mi325x.md | 78 +++++++++++++++++++++++ docker/build-sglang-bnxt-mi325x.sbatch | 12 ++++ docker/build-sglang-bnxt-mi325x.sh | 85 ++++++++++++++++++++++++++ 3 files changed, 175 insertions(+) create mode 100644 docker/README-mi325x.md create mode 100644 docker/build-sglang-bnxt-mi325x.sbatch create mode 100755 docker/build-sglang-bnxt-mi325x.sh diff --git a/docker/README-mi325x.md b/docker/README-mi325x.md new file mode 100644 index 000000000..109e512e3 --- /dev/null +++ b/docker/README-mi325x.md @@ -0,0 +1,78 @@ +# MI325X Container Image Build + +## Image: `semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori` + +SGLang v0.5.10 container for AMD Instinct MI325X/MI300X (gfx942 CDNA3) with: +- Broadcom Thor 2 RDMA support (bnxt_rocelib for RoCEv2 IBGDA) +- MoRI disaggregated inference (KV cache transfer) +- Qwen3.5 MoE (`qwen3_5_moe`), GLM-5 (`glm_moe_dsa`), DeepSeek-R1 model support +- AITER optimized kernels, TileLang NSA backends + +## Prerequisites + +1. **Broadcom BCM driver**: Download `bcm5760x_231.2.63.0a.zip` from [Broadcom support portal](https://www.broadcom.com/support) and place in `docker/` directory. + +2. 
**Docker**: Must build on a node with Docker and GPU access. Use the sbatch script on the MI325X cluster. + +3. **Docker Hub access**: Push credentials for `semianalysiswork` org. PAT is in `/nfsdata/sa/.j9s/InferenceX/.env.local` as `DOCKER_HUB_PAT`, login user `clustermax`. + +## Build + +```bash +# Option 1: Direct build (on a node with Docker) +cd docker/ +bash build-sglang-bnxt-mi325x.sh + +# Option 2: Submit as Slurm job +cd docker/ +sbatch build-sglang-bnxt-mi325x.sbatch +``` + +## Build process + +The script: +1. Clones [JordanNanos/sglang](https://github.com/JordanNanos/sglang) which contains the ROCm Dockerfile with bnxt patches +2. Copies the BCM driver into the build context +3. Builds with `SGL_BRANCH=v0.5.10`, `GPU_ARCH=gfx942`, `ENABLE_MORI=1`, `NIC_BACKEND=ibgda` +4. Pushes to `docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori` + +### What the Dockerfile builds + +- **Base**: `rocm/sgl-dev:rocm7-vllm-20250904` (ROCm 7.0 for gfx942) +- **AITER**: v0.1.10.post3 (AMD optimized kernels) +- **TileLang**: ML compiler for NSA backends (GLM-5) +- **Mooncake**: KV cache transfer engine for disaggregated serving +- **SGLang**: v0.5.10 (inference runtime) +- **MoRI**: AMD MoRI networking with bnxt_rocelib for Broadcom Thor 2 IBGDA +- **Broadcom bnxt_rocelib**: Compiled from BCM driver package + +### Build args reference + +| Arg | Default | Description | +|-----|---------|-------------| +| `SGL_BRANCH` | `v0.5.9` | SGLang git ref to build | +| `GPU_ARCH` | `gfx950` | GPU arch: `gfx942` (MI300X/MI325X) or `gfx950` (MI355X) | +| `ENABLE_MORI` | `0` | Set to `1` to build MoRI networking | +| `NIC_BACKEND` | `none` | `ainic` (Pensando), `ibgda` (Broadcom), or `none` | +| `BCM_DRIVER` | `bcm5760x_231.2.63.0a.zip` | BCM driver filename in build context | + +## Usage in InferenceX configs + +```yaml +# .github/configs/amd-master.yaml +dsr1-fp8-mi325x-sglang-disagg: + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori + ...
+``` + +## Compatibility + +- **MI325X** (gfx942, CDNA3, Broadcom Thor 2 NICs) — primary target +- **MI300X** (gfx942, CDNA3) — same architecture, works if NICs are compatible +- **MI355X** (gfx950, CDNA4) — NOT compatible, use upstream `rocm/sgl-dev` images + +## Known issues + +- EP8/DP with `--moe-a2a-backend mori` hangs on bnxt_re — use default a2a kernels (see sgl-project/sglang#22072) +- RDMA SQ overflow at high concurrency with EP8 — cap `MORI_IO_QP_MAX_SEND_WR=4096` +- Non-MLA models (Qwen3.5, GLM-5) need matched TP sizes between prefill and decode (see sgl-project/sglang#15674) diff --git a/docker/build-sglang-bnxt-mi325x.sbatch b/docker/build-sglang-bnxt-mi325x.sbatch new file mode 100644 index 000000000..6e051c71f --- /dev/null +++ b/docker/build-sglang-bnxt-mi325x.sbatch @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --job-name=build-sgl-bnxt +#SBATCH --partition=compute +#SBATCH --nodes=1 +#SBATCH --gres=gpu:1 +#SBATCH --time=360 +#SBATCH --output=%u-build-mi325x-%j.log +#SBATCH --error=%u-build-mi325x-%j.log +#SBATCH --chdir=/tmp + +set -euo pipefail +bash "${SLURM_SUBMIT_DIR}/build-sglang-bnxt-mi325x.sh" diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh new file mode 100755 index 000000000..be6efc71e --- /dev/null +++ b/docker/build-sglang-bnxt-mi325x.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Build SGLang container image for MI325X with Broadcom bnxt_re RDMA support. 
+# +# Prerequisites: +# - Docker installed and running on a compute node with GPU access +# - Broadcom BCM driver archive placed in docker/ directory (see BCM_DRIVER below) +# - Docker Hub credentials: login as 'clustermax' to semianalysiswork org +# (PAT in /nfsdata/sa/.j9s/InferenceX/.env.local as DOCKER_HUB_PAT) +# +# This image supports: +# - AMD Instinct MI325X (gfx942 CDNA3) — also works on MI300X (same arch) +# - SGLang v0.5.10 with Qwen3.5 MoE, GLM-5 MoE, DeepSeek-R1 model support +# - MoRI disaggregated inference with Broadcom Thor 2 IBGDA/RoCEv2 +# - AITER optimized kernels, TileLang NSA backends +# +# Usage: +# cd /path/to/InferenceX/docker +# bash build-sglang-bnxt-mi325x.sh +# +# The image is pushed to: +# docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori +# +# Build reference: https://github.com/JordanNanos/sglang/tree/main/docker + +set -euo pipefail + +# ---------- Configuration ---------- +SGL_BRANCH="v0.5.10" +GPU_ARCH="gfx942" +IMAGE_TAG="semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori" +DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git" +DOCKERFILE_REF="main" + +# Broadcom BCM driver — must be placed in the build context directory. +# Download from Broadcom support portal (requires account). 
+BCM_DRIVER="bcm5760x_231.2.63.0a.zip" + +# ---------- Clone build repo ---------- +WORK_DIR=$(mktemp -d) +echo "[build] Cloning ${DOCKERFILE_REPO} (ref: ${DOCKERFILE_REF}) into ${WORK_DIR}" +git clone --depth 1 --branch "${DOCKERFILE_REF}" "${DOCKERFILE_REPO}" "${WORK_DIR}/sglang" + +# ---------- Copy BCM driver into build context ---------- +BUILD_CONTEXT="${WORK_DIR}/sglang/docker" +if [[ -f "${BCM_DRIVER}" ]]; then + cp "${BCM_DRIVER}" "${BUILD_CONTEXT}/" + echo "[build] BCM driver copied: ${BCM_DRIVER}" +elif [[ -f "/root/cache/${BCM_DRIVER}" ]]; then + cp "/root/cache/${BCM_DRIVER}" "${BUILD_CONTEXT}/" + echo "[build] BCM driver copied from /root/cache/" +else + echo "ERROR: BCM driver not found: ${BCM_DRIVER}" + echo "Place it in the current directory or /root/cache/" + exit 1 +fi + +# ---------- Docker login ---------- +if [[ -f /nfsdata/sa/.j9s/InferenceX/.env.local ]]; then + source /nfsdata/sa/.j9s/InferenceX/.env.local + echo "${DOCKER_HUB_PAT}" | docker login -u clustermax --password-stdin +fi + +# ---------- Build ---------- +echo "[build] Building ${IMAGE_TAG}" +echo "[build] SGL_BRANCH=${SGL_BRANCH}" +echo "[build] GPU_ARCH=${GPU_ARCH}" +echo "[build] ENABLE_MORI=1, NIC_BACKEND=ibgda" + +docker build \ + --build-arg SGL_BRANCH="${SGL_BRANCH}" \ + --build-arg GPU_ARCH="${GPU_ARCH}" \ + --build-arg ENABLE_MORI=1 \ + --build-arg NIC_BACKEND=ibgda \ + --build-arg BCM_DRIVER="${BCM_DRIVER}" \ + -t "${IMAGE_TAG}" \ + -f "${BUILD_CONTEXT}/rocm.Dockerfile" \ + "${BUILD_CONTEXT}/" + +# ---------- Push ---------- +echo "[build] Pushing ${IMAGE_TAG}" +docker push "${IMAGE_TAG}" + +# ---------- Cleanup ---------- +rm -rf "${WORK_DIR}" +echo "[build] Done: ${IMAGE_TAG}" From 18f1c5cecd9fc07e37341c01eec272d9a0481a99 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 18:28:07 +0000 Subject: [PATCH 34/41] Use latest SGLang main for MI325X image build Default SGL_BRANCH=main to get full model type support (qwen3_5_moe, glm_moe_dsa, etc). 
v0.5.10 may not have GLM-5 glm_moe_dsa support. Both SGL_BRANCH and IMAGE_TAG are now overridable via env vars. Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/README-mi325x.md | 10 ++++++---- docker/build-sglang-bnxt-mi325x.sh | 11 +++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/docker/README-mi325x.md b/docker/README-mi325x.md index 109e512e3..cec903b9c 100644 --- a/docker/README-mi325x.md +++ b/docker/README-mi325x.md @@ -1,8 +1,8 @@ # MI325X Container Image Build -## Image: `semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori` +## Image: `semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori` -SGLang v0.5.10 container for AMD Instinct MI325X/MI300X (gfx942 CDNA3) with: +SGLang (latest main) container for AMD Instinct MI325X/MI300X (gfx942 CDNA3) with: - Broadcom Thor 2 RDMA support (bnxt_rocelib for RoCEv2 IBGDA) - MoRI disaggregated inference (KV cache transfer) - Qwen3.5 MoE (`qwen3_5_moe`), GLM-5 (`glm_moe_dsa`), DeepSeek-R1 model support @@ -33,8 +33,10 @@ sbatch build-sglang-bnxt-mi325x.sbatch The script: 1. Clones [JordanNanos/sglang](https://github.com/JordanNanos/sglang) which contains the ROCm Dockerfile with bnxt patches 2. Copies the BCM driver into the build context -3. Builds with `SGL_BRANCH=v0.5.10`, `GPU_ARCH=gfx942`, `ENABLE_MORI=1`, `NIC_BACKEND=ibgda` -4. Pushes to `docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori` +3. Builds with `SGL_BRANCH=main` (latest, supports all model types), `GPU_ARCH=gfx942`, `ENABLE_MORI=1`, `NIC_BACKEND=ibgda` +4. 
Pushes to `docker.io/semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori` + +Override defaults: `SGL_BRANCH=v0.5.10 IMAGE_TAG=semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori bash build-sglang-bnxt-mi325x.sh` ### What the Dockerfile builds diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh index be6efc71e..8dcaa274f 100755 --- a/docker/build-sglang-bnxt-mi325x.sh +++ b/docker/build-sglang-bnxt-mi325x.sh @@ -9,7 +9,7 @@ # # This image supports: # - AMD Instinct MI325X (gfx942 CDNA3) — also works on MI300X (same arch) -# - SGLang v0.5.10 with Qwen3.5 MoE, GLM-5 MoE, DeepSeek-R1 model support +# - SGLang (latest main) with Qwen3.5 MoE, GLM-5 MoE, DeepSeek-R1 model support # - MoRI disaggregated inference with Broadcom Thor 2 IBGDA/RoCEv2 # - AITER optimized kernels, TileLang NSA backends # @@ -18,16 +18,19 @@ # bash build-sglang-bnxt-mi325x.sh # # The image is pushed to: -# docker.io/semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori +# docker.io/semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori +# +# Override defaults with env vars: +# SGL_BRANCH=v0.5.10 IMAGE_TAG=semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori bash build-sglang-bnxt-mi325x.sh # # Build reference: https://github.com/JordanNanos/sglang/tree/main/docker set -euo pipefail # ---------- Configuration ---------- -SGL_BRANCH="v0.5.10" +SGL_BRANCH="${SGL_BRANCH:-main}" GPU_ARCH="gfx942" -IMAGE_TAG="semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-mori" +IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori}" DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git" DOCKERFILE_REF="main" From 13be2f6324ad596f677224a51fa81536699883a0 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 18:31:35 +0000 Subject: [PATCH 35/41] Update build script default to SGL_BRANCH=v0.5.10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rebuilds semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt with proper model type registration 
for qwen3_5_moe and glm_moe_dsa. BCM driver (bcm5760x_231.2.63.0a.zip) is at /nfsdata/sa/.j9s/ on the cluster — too large for git (192MB). Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/build-sglang-bnxt-mi325x.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh index 8dcaa274f..22e5f923c 100755 --- a/docker/build-sglang-bnxt-mi325x.sh +++ b/docker/build-sglang-bnxt-mi325x.sh @@ -28,9 +28,9 @@ set -euo pipefail # ---------- Configuration ---------- -SGL_BRANCH="${SGL_BRANCH:-main}" +SGL_BRANCH="${SGL_BRANCH:-v0.5.10}" GPU_ARCH="gfx942" -IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:latest-bnxt-mori}" +IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt}" DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git" DOCKERFILE_REF="main" From 9ec6e9daa39587d33be5e4bb9497e816a9a3f0cf Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 19:32:34 +0000 Subject: [PATCH 36/41] Add transformers patch layer for GLM-5/Qwen3.5 model type support SGLang v0.5.10 pins an older transformers that doesn't have glm_moe_dsa or qwen3_5_moe in its AutoConfig registry. Add a thin Dockerfile layer that upgrades transformers to the commit with these model types and verifies registration. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/build-sglang-bnxt-mi325x.sh | 11 +++++++++++ docker/patch-transformers.Dockerfile | 10 ++++++++++ 2 files changed, 21 insertions(+) create mode 100644 docker/patch-transformers.Dockerfile diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh index 22e5f923c..d23740c6e 100755 --- a/docker/build-sglang-bnxt-mi325x.sh +++ b/docker/build-sglang-bnxt-mi325x.sh @@ -79,6 +79,17 @@ docker build \ -f "${BUILD_CONTEXT}/rocm.Dockerfile" \ "${BUILD_CONTEXT}/" +# ---------- Patch transformers for GLM-5/Qwen3.5 model type support ---------- +PATCH_DOCKERFILE="$(dirname "$0")/patch-transformers.Dockerfile" +if [[ -f "${PATCH_DOCKERFILE}" ]]; then + echo "[build] Patching transformers for glm_moe_dsa/qwen3_5_moe support" + docker build \ + --build-arg BASE_IMAGE="${IMAGE_TAG}" \ + -t "${IMAGE_TAG}" \ + -f "${PATCH_DOCKERFILE}" \ + "$(dirname "$0")/" +fi + # ---------- Push ---------- echo "[build] Pushing ${IMAGE_TAG}" docker push "${IMAGE_TAG}" diff --git a/docker/patch-transformers.Dockerfile b/docker/patch-transformers.Dockerfile new file mode 100644 index 000000000..f89c44c25 --- /dev/null +++ b/docker/patch-transformers.Dockerfile @@ -0,0 +1,10 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Upgrade transformers to get glm_moe_dsa and qwen3_5_moe model type support. +# The SGLang v0.5.10 base image pins an older transformers that doesn't have these. 
+RUN pip install --no-cache-dir -U \ + "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" \ + && python3 -c "import transformers; print(f'transformers {transformers.__version__}')" \ + && python3 -c "from transformers import AutoConfig; AutoConfig.for_model('glm_moe_dsa')" \ + && echo "glm_moe_dsa model type verified" From 02645c76c3c915b81e89366e20312b806a6081cc Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 22:02:49 +0000 Subject: [PATCH 37/41] Build from SGLang main for Qwen3.5/GLM-5 PD disagg fixes v0.5.10 is missing merged PRs #19002, #19076, #19086 that fix non-MLA PD disaggregation (GQA head count, KV cache slice transfer, mamba cache size). Building from main includes all these fixes. Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/build-sglang-bnxt-mi325x.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/build-sglang-bnxt-mi325x.sh b/docker/build-sglang-bnxt-mi325x.sh index d23740c6e..20de93517 100755 --- a/docker/build-sglang-bnxt-mi325x.sh +++ b/docker/build-sglang-bnxt-mi325x.sh @@ -28,9 +28,9 @@ set -euo pipefail # ---------- Configuration ---------- -SGL_BRANCH="${SGL_BRANCH:-v0.5.10}" +SGL_BRANCH="${SGL_BRANCH:-main}" GPU_ARCH="gfx942" -IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt}" +IMAGE_TAG="${IMAGE_TAG:-semianalysiswork/sgl-bnxt-cdna3:main-bnxt}" DOCKERFILE_REPO="https://github.com/JordanNanos/sglang.git" DOCKERFILE_REF="main" From 947e33995f9b00b4e3a46d37e3b093dbbbcc6c71 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Fri, 10 Apr 2026 22:53:52 +0000 Subject: [PATCH 38/41] Switch Qwen3.5/GLM-5 to main-bnxt image with PD disagg fixes Image semianalysiswork/sgl-bnxt-cdna3:main-bnxt built from SGLang main which includes PRs #19002, #19076, #19086 fixing non-MLA PD disaggregation (GQA head count, KV cache transfer, mamba cache). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 52d7c533c..a146b0ea4 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi325x-sglang: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x @@ -1619,7 +1619,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=3" qwen3.5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi325x-disagg @@ -1802,7 +1802,7 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" glm5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x-disagg From d64877420c4da34ce8fda9b7b0b0de2a94fec9ef Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Sat, 11 Apr 2026 00:57:22 +0000 Subject: [PATCH 39/41] Switch to v0.5.10-bnxt-patched (PD fixes + transformers patch) SGLang main broke the disagg router for all models. Use v0.5.10 (has PD fixes #19086 merged) + transformers patch for model type recognition. v0.5.10 has working router + PD KV cache fixes. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a146b0ea4..9083dddf3 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi325x-sglang: - image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x @@ -1619,7 +1619,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=3" qwen3.5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi325x-disagg @@ -1802,7 +1802,7 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" glm5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:main-bnxt + image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x-disagg From d6053e1c9c5d454cd966ff505ca5e678c762e430 Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Sat, 11 Apr 2026 03:42:02 +0000 Subject: [PATCH 40/41] Add thin bnxt layer Dockerfile for existing SGLang images add-bnxt.Dockerfile adds Broadcom bnxt_rocelib + transformers patch on top of any SGLang ROCm base image. Faster than full rebuild when the base image already has the right SGLang version. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docker/add-bnxt.Dockerfile | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 docker/add-bnxt.Dockerfile diff --git a/docker/add-bnxt.Dockerfile b/docker/add-bnxt.Dockerfile new file mode 100644 index 000000000..9bb82eb58 --- /dev/null +++ b/docker/add-bnxt.Dockerfile @@ -0,0 +1,48 @@ +# Thin layer that adds Broadcom bnxt_rocelib RDMA support to any SGLang ROCm image. +# Usage: +# docker build --build-arg BASE_IMAGE=lmsysorg/sglang:v0.5.9-rocm700-mi30x \ +# -t semianalysiswork/sgl-bnxt-cdna3:v0.5.9-bnxt-lite \ +# -f add-bnxt.Dockerfile . + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Install RDMA build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libibumad-dev rdma-core ibverbs-utils infiniband-diags \ + gcc make libtool autoconf librdmacm-dev rdmacm-utils \ + perftest ethtool libibverbs-dev strace \ + && rm -rf /var/lib/apt/lists/* + +# Install Broadcom bnxt_rocelib +ARG BCM_DRIVER=bcm5760x_231.2.63.0a.zip +COPY ${BCM_DRIVER} /tmp/${BCM_DRIVER} +RUN cd /tmp && \ + case "${BCM_DRIVER}" in \ + *.zip) apt-get update && apt-get install -y unzip && unzip -o ./${BCM_DRIVER} ;; \ + *.tar.gz) tar zxf ./${BCM_DRIVER} ;; \ + *) echo "ERROR: unsupported archive: ${BCM_DRIVER}" && exit 1 ;; \ + esac && \ + DIR_NAME="${BCM_DRIVER%.*}" && \ + # Handle double extension (.tar.gz) + case "${BCM_DRIVER}" in *.tar.gz) DIR_NAME="${BCM_DRIVER%.tar.gz}" ;; esac && \ + cd /tmp/${DIR_NAME}/drivers_linux/bnxt_rocelib && \ + BCM_LIB=$(ls -1 *.tar.gz) && \ + tar zxf ${BCM_LIB} && \ + cd "${BCM_LIB%.tar.gz}" && \ + sh ./autogen.sh && \ + sh ./configure && \ + make -j8 && \ + # Backup inbox drivers and install + find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; 2>/dev/null || true && \ + make install && \ + echo /usr/local/lib >> /etc/ld.so.conf && \ + ldconfig && \ + # Cleanup + rm -rf /tmp/${BCM_DRIVER} /tmp/${DIR_NAME} && \ + echo 
"bnxt_rocelib installed successfully" + +# Upgrade transformers for glm_moe_dsa and qwen3_5_moe model type support +RUN pip install --no-cache-dir -U \ + "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" \ + && python3 -c "import transformers; print(f'transformers {transformers.__version__}')" From 757d01536e6a0e38f90b1fb336cf012ea4326cda Mon Sep 17 00:00:00 2001 From: JordanNanos Date: Sat, 11 Apr 2026 03:45:34 +0000 Subject: [PATCH 41/41] Switch Qwen3.5/GLM-5 to amd-disagg-bnxt-lite image Built from rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 (AMD's PD disagg image) + bnxt_rocelib + transformers patch. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9083dddf3..c2b55425f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi325x-sglang-mtp: - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi325x-sglang: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched + image: semianalysiswork/sgl-bnxt-cdna3:amd-disagg-bnxt-lite model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x @@ -1619,7 +1619,7 @@ dsr1-fp8-mi325x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=3" qwen3.5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched + image: semianalysiswork/sgl-bnxt-cdna3:amd-disagg-bnxt-lite model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi325x-disagg @@ -1802,7 +1802,7 @@ qwen3.5-fp8-mi325x-sglang-disagg: - "MODEL_YAML_KEY=Qwen3.5-397B-A17B-FP8-no-mori" glm5-fp8-mi325x-sglang-disagg: - image: semianalysiswork/sgl-bnxt-cdna3:v0.5.10-bnxt-patched + image: semianalysiswork/sgl-bnxt-cdna3:amd-disagg-bnxt-lite model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi325x-disagg