Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
6e39343
feat: MI300X disaggregated inference with Broadcom IBGDA (issue #982)
JordanNanos Apr 2, 2026
4071ffe
fix: address PR review comments
JordanNanos Apr 2, 2026
fba085b
fix: resolve HF cache snapshot path on compute node in job.slurm
JordanNanos Apr 3, 2026
65e8f99
fix: use glob to resolve HF cache dir with unknown org prefix
JordanNanos Apr 3, 2026
d407492
Merge remote-tracking branch 'origin/main' into jordan/mi300x-disagg-…
JordanNanos Apr 3, 2026
8cacb31
fix: move MI300X/MI325X disagg image to semianalysiswork Docker Hub
JordanNanos Apr 3, 2026
27fab49
fix: replace srun-based IP/model checks with scontrol in job.slurm
JordanNanos Apr 4, 2026
a4e56c3
ci: add 2-node-only disagg config keys for MI300X
JordanNanos Apr 5, 2026
fa21716
fix: submit sbatch as root so srun steps work on worker nodes
Apr 5, 2026
3683c13
fix: use squeue -j JOB_ID instead of squeue -u USER for job polling
Apr 5, 2026
f9b16f0
fix: use sudo -E to preserve environment when submitting sbatch as root
Apr 5, 2026
c1ca311
fix: sanitize MODEL_NAME in Docker container name
Apr 5, 2026
27a0836
fix: add timeout to NFS refresh srun step to prevent infinite hang
Apr 5, 2026
5626990
fix: use timeout wrapper instead of invalid srun --timeout flag
Apr 5, 2026
46a147c
fix: increase prefill TP from 4 to 8 in 2-node configs for R1-0528
Apr 5, 2026
691151d
fix: fall back to /vfs/models_blog when HF cache blobs are missing
Apr 6, 2026
7fa054e
fix: auto-configure UFW to allow cross-node external IP traffic
Apr 6, 2026
300ca05
fix: bind container barrier server to 0.0.0.0 for cross-subnet access
Apr 6, 2026
5a8ffec
fix: handle zero TPOT in process_result.py to avoid ZeroDivisionError
Apr 6, 2026
1b936c5
fix: address review comments on process_result.py and job.slurm
Apr 8, 2026
0ab1fcd
Merge remote-tracking branch 'origin/main' into jordan/mi300x-disagg-…
Apr 8, 2026
21f5eef
ci: re-trigger sweep after UFW fix on node 049
Apr 8, 2026
1237dac
ci: re-run sweep for full pareto coverage
Apr 8, 2026
7c3275d
Merge remote-tracking branch 'origin/main' into jordan/mi300x-disagg-…
Apr 8, 2026
202beb8
feat: add MI300X single-node benchmarks for GLM-5, Kimi K2.5, MiniMax…
Apr 9, 2026
3f86f97
fix: use semianalysiswork sgl-bnxt-cdna3 image for all MI300X SGLang …
Apr 9, 2026
1b12c42
feat: add GLM-5 FP8 MI300X single-node benchmark script
Apr 9, 2026
ccb4ee4
fix: use sglang v0.5.10 image for GLM-5 MI300X
Apr 9, 2026
0c75ef1
feat: add Qwen 3.5 and GLM-5 disaggregated inference on MI300X
Apr 9, 2026
59e3cf0
feat: build and publish sgl-bnxt-cdna3:v0.5.10-bnxt for GLM-5 disagg
Apr 10, 2026
d8419ee
fix: add --ep-dispatch-algorithm fake to Qwen/GLM-5 disagg configs
Apr 10, 2026
11cd247
fix: use v0.5.10-bnxt for Qwen disagg — v0.5.9 produces 0 successful …
Apr 10, 2026
6de3133
fix: reduce memory config for Qwen/GLM-5 disagg on MI300X (192GB HBM3)
Apr 10, 2026
17b20ba
fix: disable CUDA graphs and reduce memory further for Qwen/GLM-5 disagg
Apr 10, 2026
0acc23e
fix: add librocm_smi64.so.7 compat symlink in v0.5.10-bnxt image
Apr 11, 2026
1228171
fix: reduce decode mem_fraction to 0.5 for Qwen/GLM-5 disagg on MI300X
Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
514 changes: 513 additions & 1 deletion .github/configs/amd-master.yaml

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ mi300x:
- 'mi300x-amds_1'
- 'mi300x-amds_2'
- 'mi300x-amds_3'
mi300x-disagg:
- 'mi300x-amds_0'
- 'mi300x-amds_2'
- 'mi300x-amds_3'
mi325x:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
Expand Down
23 changes: 23 additions & 0 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ if [[ -z "$IBDEVICES" ]]; then
export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
elif [[ $NODENAME == mia1* ]]; then
export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
elif [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI300X cluster: Broadcom RoCE (bnxt_re); all 8 devices present
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
else
echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
exit 1
Expand All @@ -42,6 +48,13 @@ export SGLANG_USE_AITER=1
export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200

# GLM-5: uses NSA (not MLA), needs fused-decode-MLA disabled + fast loading
if [[ "$MODEL_NAME" == "GLM-5-FP8" ]]; then
# Disable the ROCm fused decode-MLA path — GLM-5's attention is NSA, not MLA.
export SGLANG_ROCM_FUSED_DECODE_MLA=0
# presumably selects INT4 quantization for ROCm quick-reduce collectives — TODO confirm
export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
# Enable safetensors fast GPU loading to shorten model startup time.
export SAFETENSORS_FAST_GPU=1
fi

# Disable allocating memory in one pass
export MORI_SHMEM_MODE=ISOLATION
export SGLANG_MORI_FP8_DISP=True
Expand Down Expand Up @@ -101,6 +114,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
fi
Expand All @@ -114,6 +132,11 @@ else
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]] || [[ $NODENAME == chi-mi300x* ]]; then
# Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
echo " This is normal for clusters without QoS or outside Docker containers."
Expand Down
151 changes: 114 additions & 37 deletions benchmarks/multi_node/amd_utils/job.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then
exit 1
fi

# Validate MODEL_NAME exists as a top-level key in models.yaml
if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
echo "Error: Model '$MODEL_NAME' not found in models.yaml"
# MODEL_YAML_KEY: top-level key in models.yaml for server config lookup.
# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>).
_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"

# Validate the yaml key exists as a top-level key in models.yaml
if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then
echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml"
echo "Available models:"
grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /'
exit 1
fi
echo "Model found: $MODEL_NAME"
echo "Model found: $_MODEL_YAML_KEY"

# All models use server.sh as the entrypoint
RUN_FILE="server.sh"
Expand All @@ -63,6 +67,7 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}"
DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}"
DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP
ENABLE_DISAGG_DECODE_PARALLELISM_FLAGS="${ENABLE_DISAGG_DECODE_PARALLELISM_FLAGS:-false}"

# Benchmark Configuration with defaults
BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
Expand Down Expand Up @@ -104,35 +109,41 @@ TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
echo "Total allocated nodes: $TOTAL_NODES"
echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"

# Check that a model path exists. Runs on the batch node only: the model
# directory is on NFS and shared across all nodes, so a single local check
# is sufficient (the old per-node srun fan-out caused avoidable timeouts).
#
# Arguments:
#   $1 - path to test for existence (directory)
#   $2 - human-readable label used in log messages
# Returns:
#   0 if the directory exists, 1 otherwise
check_model_path() {
  local path=$1
  local check_name=$2

  echo "Checking $check_name: $path"

  if [[ -d "$path" ]]; then
    echo "$(hostname): ✓ Found $path"
    echo "✓ $check_name available on ALL nodes"
    return 0
  else
    echo "$(hostname): ✗ Missing $path"
    echo "✗ $check_name NOT available on all nodes"
    return 1
  fi
}

# If MODEL_NAME is a plain name (not already a HF cache path), try to resolve
# the HF hub cache layout on this node: models--{org}--{repo}/snapshots/<hash>
# This handles clusters where the cache is node-local and can't be resolved
# from the job launcher (which may run on a different host).
if [[ "$MODEL_NAME" != models--* ]] && [[ "$MODEL_NAME" != *snapshots* ]]; then
  # Glob for any HF cache dir ending in --${MODEL_NAME} (handles unknown org
  # prefix). Use shell globs instead of parsing `ls` output, which is fragile
  # with unusual filenames; nullglob makes an unmatched pattern expand to
  # nothing instead of the literal pattern string.
  shopt -s nullglob
  _HF_DIR_CANDIDATES=( "${MODEL_DIR}/models--"*"--${MODEL_NAME}" )
  if (( ${#_HF_DIR_CANDIDATES[@]} > 0 )); then
    _HF_DIR=$(basename "${_HF_DIR_CANDIDATES[0]}")
    # Globs expand in lexicographic order, so the last element matches what
    # the previous `ls | sort | tail -1` selection would have chosen.
    _SNAPSHOTS=( "${MODEL_DIR}/${_HF_DIR}/snapshots/"* )
    if (( ${#_SNAPSHOTS[@]} > 0 )); then
      MODEL_NAME="${_HF_DIR}/snapshots/$(basename "${_SNAPSHOTS[-1]}")"
      echo "Resolved MODEL_NAME from local HF cache: ${MODEL_NAME}"
    fi
  fi
  shopt -u nullglob
fi

# Check model weights exist on "$MODEL_DIR/$MODEL_NAME"
if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
Expand All @@ -151,6 +162,29 @@ fi
echo "Final MODEL_PATH: $MODEL_PATH"
echo ""

# Verify model weights are actually readable (not just symlinks to missing blobs).
# The HF cache may have snapshot symlinks pointing to blob files that no longer exist.
# compgen -G expands the glob without spawning/parsing `ls`; prints nothing on no match.
_MODEL_WEIGHT_SAMPLE=$(compgen -G "$MODEL_PATH/model-00001-of-*.safetensors" | head -1)
if [[ -n "$_MODEL_WEIGHT_SAMPLE" ]] && ! head -c 1 "$_MODEL_WEIGHT_SAMPLE" > /dev/null 2>&1; then
  echo "[WARN] Model weights at $MODEL_PATH have broken symlinks (blobs missing)"
  echo "[INFO] Searching /vfs/models_blog/ for a matching model..."
  # Use _MODEL_YAML_KEY (falls back to MODEL_NAME) — the raw MODEL_YAML_KEY may
  # be unset, which would make the glob below match an arbitrary model dir.
  # Strip trailing date suffix (e.g. -0528, -1105) for fuzzy directory match.
  _VFS_KEY=$(echo "$_MODEL_YAML_KEY" | sed 's/-[0-9]\{4\}$//')
  _VFS_MODEL=""
  if [[ -n "$_VFS_KEY" ]]; then
    _VFS_MODEL=$(compgen -G "/vfs/models_blog/*${_VFS_KEY}*" | head -1)
  fi
  if [[ -n "$_VFS_MODEL" ]] && [[ -d "$_VFS_MODEL" ]]; then
    if compgen -G "$_VFS_MODEL/model-00001-of-*.safetensors" > /dev/null; then
      MODEL_PATH="$_VFS_MODEL"
      MODEL_DIR="/vfs/models_blog"
      MODEL_NAME=$(basename "$_VFS_MODEL")
      echo "[INFO] /vfs fallback: MODEL_PATH=$MODEL_PATH, MODEL_NAME=$MODEL_NAME"
    else
      echo "[WARN] /vfs fallback also has no weights at $_VFS_MODEL"
    fi
  else
    echo "[WARN] No matching model found in /vfs/models_blog/ for key '$_VFS_KEY'"
  fi
fi

NUM_NODES="${NUM_NODES}"

# ------------------------
Expand Down Expand Up @@ -223,15 +257,26 @@ echo ""
# Node information
USER_NAME=$(whoami)
MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1)
NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1')
NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}')

IPS=()
# Use scontrol to get node IPs — avoids srun timeouts when nodes are
# temporarily slow. Prints the NodeAddr value, or nothing if the node is
# unknown / scontrol is unavailable (callers must check for empty output).
get_node_ip() {
  local node=$1
  # Parse "NodeAddr=<ip>" portably. The previous awk used match($0, re, arr),
  # whose third argument is a gawk extension and fails under mawk/busybox awk.
  scontrol show node "$node" 2>/dev/null \
    | tr ' ' '\n' \
    | awk -F= '$1 == "NodeAddr" { print $2; exit }'
}

NODE0_ADDR=$(get_node_ip "$MASTER_NODE")
if [[ -z "$NODE0_ADDR" ]]; then
echo "ERROR: Could not resolve IP for master node $MASTER_NODE via scontrol" >&2
exit 1
fi

GW_NIC=$(ip route | awk '/^default/ {print $5; exit}')
IPS=()
for NODE in $SELECTED_NODES; do
IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1')
IP=$(echo "$IP" | awk '/src/ {print $7}')
IP=$(get_node_ip "$NODE")
if [[ -z "$IP" ]]; then
echo "ERROR: Could not resolve IP for node $NODE via scontrol" >&2
exit 1
fi
IPS+=("$IP")
done

Expand All @@ -249,10 +294,9 @@ echo "NNODES is ${NNODES}"
echo "REPO Directory is ${DI_REPO_DIR}"
echo "USER_NAME is ${USER_NAME}"

# Get the RDMA priority and DSCP value from the NIC
# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully)
if ! command -v nicctl >/dev/null 2>&1; then
echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
exit 1
echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2
fi

# Reduce log spam
Expand All @@ -276,6 +320,7 @@ export DECODE_TP_SIZE=$DECODE_TP_SIZE
export DECODE_ENABLE_EP=$DECODE_ENABLE_EP
export DECODE_ENABLE_DP=$DECODE_ENABLE_DP
export DECODE_MTP_SIZE=$DECODE_MTP_SIZE
export ENABLE_DISAGG_DECODE_PARALLELISM_FLAGS=$ENABLE_DISAGG_DECODE_PARALLELISM_FLAGS
export GPUS_PER_NODE=$GPUS_PER_NODE
export BENCH_INPUT_LEN=$BENCH_INPUT_LEN
export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN
Expand All @@ -286,7 +331,8 @@ export DRY_RUN="${DRY_RUN:-0}"
export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"

SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
SANITIZED_MODEL=$(echo "$MODEL_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${SANITIZED_MODEL}_${SLURM_JOB_ID}"
export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}"


Expand All @@ -296,8 +342,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)

# Signal handler: remove the job's stale logs folder before exit.
# Invoked via `trap cleanup INT TERM HUP`.
cleanup() {
  echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
  # NFS-safe cleanup: wrap rm in timeout to avoid hanging on stale NFS locks.
  # Guard against an empty SLURM_SUBMIT_DIR so this can never run `rm -rf /logs`.
  if [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
    timeout --kill-after=5 30 sudo rm -rf -- "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true
  fi

  echo "[${SLURM_JOB_ID}] cleanup done."
}
Expand All @@ -307,16 +353,45 @@ trap cleanup INT TERM HUP

# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors
echo "Refreshing NFS caches on all nodes..."
srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
timeout 30 srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
sync
# Force re-stat of the mounted directory to refresh NFS handles
ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1
stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
timeout 10 ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1
timeout 10 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
timeout 10 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
# Drop caches if we have permission (optional, requires root)
echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true
echo "NFS cache refreshed on $(hostname)"
'
' || echo "[WARN] NFS refresh srun failed (non-fatal); continuing."

# -----------------------------------------------------------------------
# Cross-node firewall setup: allow external IPs for SGLang bootstrap.
# The MoRI bootstrap server binds to the default NIC (external IP), but
# UFW may only allow internal subnet traffic. Collect each node's external
# IP and add UFW allow rules on all nodes before starting Docker.
# UFW rules persist across reboots, but we run this per-job so that newly
# added or re-imaged nodes automatically receive the correct peer rules.
# -----------------------------------------------------------------------
_UFW_DIR="${BENCHMARK_LOGS_DIR}/.ufw_${SLURM_JOB_ID}"
mkdir -p "$_UFW_DIR"
timeout 30 srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
_IFACE=$(ip route show default 2>/dev/null | awk "NR==1{print \$5}")
_EXT_IP=$(ip -4 addr show "$_IFACE" 2>/dev/null | awk "/inet /{sub(/\/.*$/, \"\", \$2); print \$2; exit}")
if [[ -n "$_EXT_IP" ]]; then
echo "$_EXT_IP" > '"$_UFW_DIR"'/$SLURM_PROCID.ip
echo "[INFO] $(hostname): external IP $_EXT_IP (rank $SLURM_PROCID)"
fi
' 2>/dev/null || echo "[WARN] External IP collection failed (non-fatal)."
for _IP_FILE in "$_UFW_DIR"/*.ip; do
[[ -f "$_IP_FILE" ]] || continue
_PEER_IP=$(cat "$_IP_FILE")
[[ -z "$_PEER_IP" ]] && continue
Comment on lines +375 to +388
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this a one-time setup that persists between restarts? If it is a one-time thing, maybe move it to a utils folder?

timeout 15 srun --nodelist="$SELECTED_NODELIST_SRUN" \
bash -c "ufw allow from '$_PEER_IP' 2>/dev/null || true; echo \"[INFO] UFW: allowed $_PEER_IP on \$(hostname)\"" \
2>/dev/null || true
done
rm -rf "$_UFW_DIR"
echo "Cross-node firewall setup complete."

srun \
--nodelist="$SELECTED_NODELIST_SRUN" \
Expand Down Expand Up @@ -357,7 +432,7 @@ exec sudo docker run --rm \
--privileged \
-v ${MODEL_DIR}:/models \
-v \$HOME/.ssh:/root/.ssh \
-v $(which nicctl):/usr/sbin/nicctl \
$(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \
--shm-size 128G \
-v /tmp:/run_logs \
-v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
Expand All @@ -373,6 +448,7 @@ exec sudo docker run --rm \
-e xP=\$xP \
-e yD=\$yD \
-e MODEL_NAME=\$MODEL_NAME \
-e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \
-e IPADDRS=\$IPADDRS \
-e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
-e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \
Expand All @@ -381,6 +457,7 @@ exec sudo docker run --rm \
-e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \
-e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \
-e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \
-e ENABLE_DISAGG_DECODE_PARALLELISM_FLAGS=\$ENABLE_DISAGG_DECODE_PARALLELISM_FLAGS \
-e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \
-e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \
-e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \
Expand Down
62 changes: 62 additions & 0 deletions benchmarks/multi_node/amd_utils/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,68 @@ DeepSeek-R1-0528:
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"

# Qwen 3.5 397B (A17B active) FP8 — disaggregated prefill/decode config.
# NOTE: base_flags includes --disable-cuda-graph, so the cuda_graph_bs* keys
# below are currently inert; they are retained for when graphs are re-enabled.
Qwen3.5-397B-A17B-FP8:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --disable-cuda-graph"
# No MTP / speculative-decoding flags for this model.
mtp_flags: ""
# Extra flags applied only when DP attention is enabled.
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
# Reduced memory fractions for MI300X 192GB HBM3 (per the "reduce memory
# config" commits in this PR) — TODO confirm headroom on MI325X.
mem_fraction_static: 0.6
disable_radix_cache: true
dp:
max_running_requests: 24
# Evaluated at runtime: chunk size scales with prefill TP width.
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
cuda_graph_bs: "1 2 3"
no_dp:
max_running_requests: 8
chunked_prefill_size: 65536
cuda_graph_bs_range: "1-8"
decode:
# Decode mem_fraction lowered to 0.5 for MI300X per this PR's commit history.
mem_fraction_static: 0.5
prefill_round_robin_balance: true
dp:
max_running_requests: 4096
# Evaluated at runtime: chunk size scales with decode TP width.
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-160"
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
no_dp:
max_running_requests: 4
chunked_prefill_size: 32768
cuda_graph_bs_range: "1-4"

# GLM-5 FP8 — disaggregated prefill/decode config. Mirrors the Qwen3.5 entry
# above except for memory fractions and GLM-specific parser/loader flags.
# NOTE: base_flags includes --disable-cuda-graph, so the cuda_graph_bs* keys
# below are currently inert; they are retained for when graphs are re-enabled.
GLM-5-FP8:
# The triple-escaped quotes in --model-loader-extra-config presumably survive
# two levels of shell interpolation before reaching the server — TODO confirm.
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\"enable_multithread_load\\\": true, \\\"num_threads\\\": 8}' --disable-cuda-graph"
# No MTP / speculative-decoding flags for this model.
mtp_flags: ""
# Extra flags applied only when DP attention is enabled.
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.65
disable_radix_cache: true
dp:
max_running_requests: 24
# Evaluated at runtime: chunk size scales with prefill TP width.
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
cuda_graph_bs: "1 2 3"
no_dp:
max_running_requests: 8
chunked_prefill_size: 65536
cuda_graph_bs_range: "1-8"
decode:
# Decode mem_fraction lowered to 0.5 for MI300X per this PR's commit history.
mem_fraction_static: 0.5
prefill_round_robin_balance: true
dp:
max_running_requests: 4096
# Evaluated at runtime: chunk size scales with decode TP width.
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-160"
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
no_dp:
max_running_requests: 4
chunked_prefill_size: 32768
# NOTE(review): range "1-8" exceeds max_running_requests 4; the equivalent
# Qwen3.5 no_dp entry uses "1-4" — verify this difference is intentional.
cuda_graph_bs_range: "1-8"

DeepSeek-R1-0528-MXFP4-Preview:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
Expand Down
Loading
Loading