Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
834fe82
Add MI325X DeepSeek-R1 FP8 disaggregated inference (1P1D, Broadcom Th…
Mar 31, 2026
7b50476
Update amd-master.yaml
JordanNanos Mar 31, 2026
b40908c
Add MTP config, expand sweep to full pareto frontier, use -good image
Mar 31, 2026
2421ca5
Add perf-changelog entry for MI325X disagg configs
Mar 31, 2026
6abdf85
Fix MI325X QoS detection and NFS-safe cleanup for disagg benchmarks
JordanNanos Apr 1, 2026
3716258
Add local NVMe model caching for faster model loading
JordanNanos Apr 1, 2026
db677bd
Switch model caching from rsync to rclone sync
JordanNanos Apr 1, 2026
0a485de
Add MTP baseline to single-node MI325X DeepSeek-R1 FP8 config
JordanNanos Apr 1, 2026
67dec7c
Split MI325X single-node MTP into separate config key
JordanNanos Apr 1, 2026
f18257f
Fix MI325X single-node script resolution and add MTP support
JordanNanos Apr 2, 2026
3ccfba3
Fix decode dispatch token limit for DP attention disagg configs
JordanNanos Apr 2, 2026
0213032
Disable EP8/DP disagg configs on MI325X and bump MTP to 3 tokens
JordanNanos Apr 2, 2026
2afb24a
Add single-node EP8/DP test configs for MI325X disagg
JordanNanos Apr 2, 2026
36aebfd
Move container image to semianalysiswork Docker Hub and fix launcher …
JordanNanos Apr 3, 2026
b5a0bc2
Test EP8/DP workaround: drop MoRI a2a backend on MI325X bnxt_re
JordanNanos Apr 4, 2026
beb3808
Fix MODEL_NAME for EP8/DP test configs with MODEL_YAML_KEY override
JordanNanos Apr 4, 2026
23c2931
fix: resolve MODEL_NAME from flat repo dir when HF snapshot absent
JordanNanos Apr 4, 2026
e5b9d00
Tune EP8/DP test: lower concurrency + QP params for SQ full fix
JordanNanos Apr 4, 2026
76d89d0
fix: lower bnxt_re QP limits and concurrency for MI325X EP8/DP disagg
JordanNanos Apr 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
372 changes: 372 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,24 @@ dsr1-fp8-mi325x-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# Single-node (multinode: false) MI325X DeepSeek-R1 FP8 SGLang config with MTP
# speculative decoding: every search-space entry sets spec-decoding: mtp.
# Both sequence-length configs (1024/1024 and 8192/1024) use tp: 8 with
# conc-start 4 and conc-end 64.
dsr1-fp8-mi325x-sglang-mtp:
image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x
precision: fp8
framework: sglang
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }

dsr1-fp8-mi355x-sglang:
image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
model: deepseek-ai/DeepSeek-R1-0528
Expand Down Expand Up @@ -1231,3 +1249,357 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsr1-fp8-mi325x-sglang-disagg:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're missing the perf-changelog.yaml entry too.

image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
# # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
# - spec-decoding: "none"
# conc-list: [ 512, 1024 ]
# prefill:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# additional-settings:
# - "PREFILL_NODES=1"
# decode:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=2"
# - "DECODE_MTP_SIZE=0"

# # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
# - spec-decoding: "none"
# conc-list: [ 768, 512, 256 ]
# prefill:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# additional-settings:
# - "PREFILL_NODES=1"
# decode:
# num-worker: 2
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=2"
# - "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"

# Single-node EP8/DP decode — workaround: use default a2a kernels instead of
# MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072
# Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8.
- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
- "MODEL_NAME=DeepSeek-R1-0528"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"
- "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
- "MODEL_NAME=DeepSeek-R1-0528"

- isl: 8192
osl: 1024
search-space:
# DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
# # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
# - spec-decoding: "none"
# conc-list: [ 512, 1024 ]
# prefill:
# num-worker: 2
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "PREFILL_NODES=2"
# decode:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=1"
# - "DECODE_MTP_SIZE=0"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "none"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure that TP4 is on the Pareto frontier here? Do you have a graph?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You only have the TP4 curve, and you have "hide non-optimal" on? Can you run the rest of the 24 datapoints?

- spec-decoding: "none"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


# MI325X DeepSeek-R1 FP8 disaggregated SGLang sweep with MTP speculative decoding
# (runner: mi325x-disagg, multinode: true, disagg: true).
# Every active search-space entry sets spec-decoding: "mtp" and DECODE_MTP_SIZE=3.
# NOTE(review): the commented-out (DISABLED) entries still carry DECODE_MTP_SIZE=1 -
# confirm the intended value before re-enabling them.
dsr1-fp8-mi325x-sglang-disagg-mtp:
image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi325x-disagg
precision: fp8
framework: sglang-disagg
multinode: true
disagg: true
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# MTP configurations
# DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
# # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8)
# - spec-decoding: "mtp"
# conc-list: [ 512, 1024 ]
# prefill:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# additional-settings:
# - "PREFILL_NODES=1"
# decode:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=2"
# - "DECODE_MTP_SIZE=1"

# # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
# - spec-decoding: "mtp"
# conc-list: [ 768, 512, 256 ]
# prefill:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# additional-settings:
# - "PREFILL_NODES=1"
# decode:
# num-worker: 2
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=2"
# - "DECODE_MTP_SIZE=1"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
- spec-decoding: "mtp"
conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=3"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=3"

# Single-node EP8/DP decode with MTP — workaround: default a2a kernels
# Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8.
# NOTE(review): MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt presumably selects the model
# profile that uses the default a2a kernels, with MODEL_NAME set explicitly
# alongside it - verify against the launcher scripts.
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
- "MODEL_NAME=DeepSeek-R1-0528"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=3"
- "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"
- "MODEL_NAME=DeepSeek-R1-0528"

- isl: 8192
osl: 1024
search-space:
# MTP configurations
# DISABLED: EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
# # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
# - spec-decoding: "mtp"
# conc-list: [ 512, 1024 ]
# prefill:
# num-worker: 2
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "PREFILL_NODES=2"
# decode:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "DECODE_NODES=1"
# - "DECODE_MTP_SIZE=1"

# "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
# NOTE(review): this conc-list goes down to 2, while the 1024/1024
# "Bottom of curve" list in this config stops at 4 - confirm this is intentional.
- spec-decoding: "mtp"
conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 2
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=3"

# "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
- spec-decoding: "mtp"
conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=3"
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ mi325x:
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
# Runner pool for configs that set `runner: mi325x-disagg`.
# NOTE(review): these labels (mi325x-amd_0..3) overlap the labels listed under
# the mi325x pool directly above, so both pools appear to target the same
# machines - confirm that job scheduling tolerates the overlap.
mi325x-disagg:
- 'mi325x-amd_0'
- 'mi325x-amd_1'
- 'mi325x-amd_2'
- 'mi325x-amd_3'
mi355x:
- 'mi355x-amds_0'
- 'mi355x-amds_1'
Expand Down
Loading
Loading