SemiAnalysisAI · JordanNanos · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
@@ -94,6 +94,24 @@ dsr1-fp8-mi325x-sglang:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
+dsr1-fp8-mi325x-sglang-mtp:  # non-multinode sglang entry on the mi325x runner with spec-decoding: mtp
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x  # -mi30x image tag; dsr1-fp8-mi355x-sglang uses the -mi35x tag
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  seq-len-configs:  # two isl/osl pairs, each with the same single search-space entry
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }  # tp 8, concurrency range 4..64
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }  # identical to the isl 1024 entry
+
 dsr1-fp8-mi355x-sglang:
   image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
   model: deepseek-ai/DeepSeek-R1-0528
@@ -1231,3 +1249,357 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+
+dsr1-fp8-mi325x-sglang-disagg:  # disaggregated sweep: each search-space entry pairs one prefill spec with one decode spec
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x-disagg  # runner pool declared in .github/configs/runners.yaml
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # DISABLED (commented out below): EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) — NOTE(review): DECODE_NODES=2 with num-worker: 1; enabled entries use DECODE_NODES == num-worker, confirm before re-enabling
+    # - spec-decoding: "none"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 768, 512, 256 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"  # every spec-decoding: "none" entry in this block sets 0
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # Single-node EP8/DP decode (enabled) — workaround: use default a2a kernels instead of
+    # MoRI a2a (which hangs on Broadcom bnxt_re). See sgl-project/sglang#22072
+    # Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8.
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"  # presumably selects the default-a2a settings — confirm against the launch scripts
+        - "MODEL_NAME=DeepSeek-R1-0528"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"  # same MODEL_YAML_KEY/MODEL_NAME pair as the prefill side
+        - "MODEL_NAME=DeepSeek-R1-0528"
+
+  - isl: 8192
+    osl: 1024
+    search-space:  # NOTE(review): no single-node EP8/DP workaround entry here, unlike isl 1024 — confirm intentional
+    # DISABLED (commented out below): EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    # - spec-decoding: "none"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "PREFILL_NODES=2"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=0"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp8-mi325x-sglang-disagg-mtp:  # disaggregated sweep where every enabled entry sets spec-decoding: "mtp"
+  image: semianalysiswork/sgl-cdna3-mori:v0.5.9-bnxt
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x-disagg  # runner pool declared in .github/configs/runners.yaml
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # DISABLED (commented out below): EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) — NOTE(review): DECODE_NODES=2 with num-worker: 1; enabled entries use DECODE_NODES == num-worker, confirm before re-enabling
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=1"
+
+    # # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8)
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 768, 512, 256 ]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "PREFILL_NODES=1"
+    #   decode:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=2"
+    #     - "DECODE_MTP_SIZE=1"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"  # enabled MTP entries use 3; the commented-out entries use 1
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
+    # Single-node EP8/DP decode with MTP (enabled) — workaround: use default a2a kernels instead of MoRI a2a. See sgl-project/sglang#22072
+    # Concurrency capped at 64: bnxt_re SQ fills up at higher concurrency under EP8.
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"  # presumably selects the default-a2a settings — confirm against the launch scripts
+        - "MODEL_NAME=DeepSeek-R1-0528"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+        - "MODEL_YAML_KEY=DeepSeek-R1-0528-bnxt"  # same MODEL_YAML_KEY/MODEL_NAME pair as the prefill side
+        - "MODEL_NAME=DeepSeek-R1-0528"
+
+  - isl: 8192
+    osl: 1024
+    search-space:  # NOTE(review): no single-node EP8/DP workaround entry here, unlike isl 1024 — confirm intentional
+    # MTP configurations
+    # DISABLED (commented out below): EP8/DP configs fail at runtime on MI325X (MoRI/RDMA issue with Broadcom Thor 2)
+    # # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8)
+    # - spec-decoding: "mtp"
+    #   conc-list: [ 512, 1024 ]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "PREFILL_NODES=2"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "DECODE_NODES=1"
+    #     - "DECODE_MTP_SIZE=1"
+
+    # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]  # NOTE(review): ends at 2; the other "Bottom of curve" lists end at 4 — confirm intentional
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"
+
+    # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8)
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
@@ -75,6 +75,11 @@ mi325x:
 - 'mi325x-amd_1'
 - 'mi325x-amd_2'
 - 'mi325x-amd_3'
+mi325x-disagg:  # pool named by "runner: mi325x-disagg"; labels amd_1..amd_3 are also listed in the mi325x pool above
+- 'mi325x-amd_0'
+- 'mi325x-amd_1'
+- 'mi325x-amd_2'
+- 'mi325x-amd_3'
 mi355x:
 - 'mi355x-amds_0'
 - 'mi355x-amds_1'