Skip to content
Draft
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
834fe82
Add MI325X DeepSeek-R1 FP8 disaggregated inference (1P1D, Broadcom Th…
Mar 31, 2026
7b50476
Update amd-master.yaml
JordanNanos Mar 31, 2026
b40908c
Add MTP config, expand sweep to full pareto frontier, use -good image
Mar 31, 2026
2421ca5
Add perf-changelog entry for MI325X disagg configs
Mar 31, 2026
6abdf85
Fix MI325X QoS detection and NFS-safe cleanup for disagg benchmarks
JordanNanos Apr 1, 2026
3716258
Add local NVMe model caching for faster model loading
JordanNanos Apr 1, 2026
db677bd
Switch model caching from rsync to rclone sync
JordanNanos Apr 1, 2026
0a485de
Add MTP baseline to single-node MI325X DeepSeek-R1 FP8 config
JordanNanos Apr 1, 2026
67dec7c
Split MI325X single-node MTP into separate config key
JordanNanos Apr 1, 2026
f18257f
Fix MI325X single-node script resolution and add MTP support
JordanNanos Apr 2, 2026
3ccfba3
Fix decode dispatch token limit for DP attention disagg configs
JordanNanos Apr 2, 2026
0213032
Disable EP8/DP disagg configs on MI325X and bump MTP to 3 tokens
JordanNanos Apr 2, 2026
2afb24a
Add single-node EP8/DP test configs for MI325X disagg
JordanNanos Apr 2, 2026
36aebfd
Move container image to semianalysiswork Docker Hub and fix launcher …
JordanNanos Apr 3, 2026
b5a0bc2
Test EP8/DP workaround: drop MoRI a2a backend on MI325X bnxt_re
JordanNanos Apr 4, 2026
beb3808
Fix MODEL_NAME for EP8/DP test configs with MODEL_YAML_KEY override
JordanNanos Apr 4, 2026
23c2931
fix: resolve MODEL_NAME from flat repo dir when HF snapshot absent
JordanNanos Apr 4, 2026
e5b9d00
Tune EP8/DP test: lower concurrency + QP params for SQ full fix
JordanNanos Apr 4, 2026
76d89d0
fix: lower bnxt_re QP limits and concurrency for MI325X EP8/DP disagg
JordanNanos Apr 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
319 changes: 319 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,24 @@ dsr1-fp8-mi325x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

dsr1-fp8-mi325x-sglang-mtp:
image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x
precision: fp8
framework: sglang
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }

dsr1-fp8-mi355x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -1231,3 +1249,304 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsr1-fp8-mi325x-sglang-disagg:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're also missing the perf-changelog YAML entry.

image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JordanNanos, check your email for access to https://hub.docker.com/u/semianalysiswork

nit: Can you move this image to our official SemiAnalysisAI public Docker Hub?

model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
- spec-decoding: "none"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
- spec-decoding: "none"
conc-list: [ 768, 512, 256 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure that TP4 is on the Pareto frontier here? Do you have a graph?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You only have the TP4 curve, and you have "hide non-optimal" on? Can you run the rest of the 24 datapoints?

- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

- isl: 8192
osl: 1024
search-space:
# "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
- spec-decoding: "none"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


dsr1-fp8-mi325x-sglang-disagg-mtp:
image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt-good
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# MTP configurations
# "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
- spec-decoding: "mtp"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"

# "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
- spec-decoding: "mtp"
conc-list: [ 768, 512, 256 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=2"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How much MTP to use is really a tradeoff between compute and memory bandwidth: more MTP means more compute. For the top left of the curve (high concurrency), you definitely want less MTP.

For low concurrency (bottom right of the curve), you definitely want more MTP.

For low concurrency, the speed-of-light (SOL) setting should be MTP=3,

though it depends on how optimized the AMD kernels actually are.

Can you try DECODE_MTP_SIZE=3? There is a reasonable chance that it is better, @JordanNanos.


- isl: 8192
osl: 1024
search-space:
# MTP configurations
# "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
- spec-decoding: "mtp"
conc-list: [ 512, 1024 ]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=2"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ mi325x:
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
mi325x-disagg:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
mi355x:
- 'mi355x-amds_0'
- 'mi355x-amds_1'
Expand Down
13 changes: 13 additions & 0 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then
export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
elif [[ $NODENAME == mia1* ]]; then
export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
else
echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
exit 1
Expand Down Expand Up @@ -101,6 +104,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
fi
Expand All @@ -114,6 +122,11 @@ else
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == chi-mi325x* ]]; then
# Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
export MORI_RDMA_TC=104
export MORI_RDMA_SL=3
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
echo " This is normal for clusters without QoS or outside Docker containers."
Expand Down
Loading
Loading