-
Notifications
You must be signed in to change notification settings - Fork 122
[AMD] [code not in mergeable state yet][blocker waiting for more nodes to speed up dev iteration speed] mi325 sglang disagg #985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
834fe82
7b50476
b40908c
2421ca5
6abdf85
3716258
db677bd
0a485de
67dec7c
f18257f
3ccfba3
0213032
2afb24a
36aebfd
b5a0bc2
beb3808
23c2931
e5b9d00
76d89d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1231,3 +1231,152 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: | |
| - "DECODE_NODES=1" | ||
| - "DECODE_MTP_SIZE=1" | ||
|
|
||
|
|
||
| dsr1-fp8-mi325x-sglang-disagg: | ||
| image: ghcr.io/jordannanos/sgl-mi325x-mori:v0.5.9-bnxt | ||
|
||
| model: deepseek-ai/DeepSeek-R1-0528 | ||
| model-prefix: dsr1 | ||
| runner: mi325x-disagg | ||
| precision: fp8 | ||
| framework: sglang-disagg | ||
| multinode: true | ||
| disagg: true | ||
| seq-len-configs: | ||
| - isl: 1024 | ||
| osl: 1024 | ||
| search-space: | ||
| # "Top of curve" (1 prefill worker at TP8, 1 decode worker at DEP8) | ||
| - spec-decoding: "none" | ||
| conc-list: [ 512, 1024 ] | ||
| prefill: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "PREFILL_NODES=1" | ||
| decode: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 8 | ||
| dp-attn: true | ||
| additional-settings: | ||
| - "DECODE_NODES=2" | ||
| - "DECODE_MTP_SIZE=0" | ||
|
|
||
| # "Middle of curve" (1 prefill worker at TP8, 2 decode workers at DEP8) | ||
| - spec-decoding: "none" | ||
| conc-list: [ 768, 512, 256 ] | ||
| prefill: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "PREFILL_NODES=1" | ||
| decode: | ||
| num-worker: 2 | ||
| tp: 8 | ||
| ep: 8 | ||
| dp-attn: true | ||
| additional-settings: | ||
| - "DECODE_NODES=2" | ||
| - "DECODE_MTP_SIZE=0" | ||
|
|
||
| # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) | ||
| - spec-decoding: "none" | ||
| conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] | ||
| prefill: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "PREFILL_NODES=1" | ||
| decode: | ||
| num-worker: 2 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "DECODE_NODES=2" | ||
| - "DECODE_MTP_SIZE=0" | ||
|
|
||
| # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are you sure that TP4 is on the Pareto curve here? Do you have a graph?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You only have the TP4 curve and you have "hide non-optimal"? Can you run the rest of the 24 data points? |
||
| - spec-decoding: "none" | ||
| conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] | ||
| prefill: | ||
| num-worker: 1 | ||
| tp: 4 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "PREFILL_NODES=1" | ||
| decode: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "DECODE_NODES=1" | ||
| - "DECODE_MTP_SIZE=0" | ||
|
|
||
functionstackx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| - isl: 8192 | ||
| osl: 1024 | ||
| search-space: | ||
| # "Top of curve" (2 prefill workers at DEP8, 1 decode worker at DEP8) | ||
| - spec-decoding: "none" | ||
| conc-list: [ 512, 1024 ] | ||
| prefill: | ||
| num-worker: 2 | ||
| tp: 8 | ||
| ep: 8 | ||
| dp-attn: true | ||
| additional-settings: | ||
| - "PREFILL_NODES=2" | ||
| decode: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 8 | ||
| dp-attn: true | ||
| additional-settings: | ||
| - "DECODE_NODES=1" | ||
| - "DECODE_MTP_SIZE=0" | ||
|
|
||
| # "Bottom of curve" (1 prefill worker at TP8, 2 decode workers at TP8) | ||
| - spec-decoding: "none" | ||
| conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] | ||
| prefill: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "PREFILL_NODES=1" | ||
| decode: | ||
| num-worker: 2 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "DECODE_NODES=2" | ||
| - "DECODE_MTP_SIZE=0" | ||
|
|
||
| # "Low concurrency" (1 prefill worker at TP4, 1 decode worker at TP8) | ||
| - spec-decoding: "none" | ||
| conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] | ||
| prefill: | ||
| num-worker: 1 | ||
| tp: 4 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "PREFILL_NODES=1" | ||
| decode: | ||
| num-worker: 1 | ||
| tp: 8 | ||
| ep: 1 | ||
| dp-attn: false | ||
| additional-settings: | ||
| - "DECODE_NODES=1" | ||
| - "DECODE_MTP_SIZE=0" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then | |
| exit 1 | ||
| fi | ||
|
|
||
| # Validate MODEL_NAME exists as a top-level key in models.yaml | ||
| if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then | ||
| echo "Error: Model '$MODEL_NAME' not found in models.yaml" | ||
| # MODEL_YAML_KEY is the models.yaml lookup key (bare model name, e.g. DeepSeek-R1-0528). | ||
| # MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>). | ||
| _MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}" | ||
|
|
||
| # Validate the yaml key exists as a top-level key in models.yaml | ||
| if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then | ||
| echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml" | ||
| echo "Available models:" | ||
| grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' | ||
| exit 1 | ||
| fi | ||
| echo "Model found: $MODEL_NAME" | ||
| echo "Model found: $_MODEL_YAML_KEY" | ||
|
|
||
| # All models use server.sh as the entrypoint | ||
| RUN_FILE="server.sh" | ||
|
|
@@ -249,10 +253,9 @@ echo "NNODES is ${NNODES}" | |
| echo "REPO Directory is ${DI_REPO_DIR}" | ||
| echo "USER_NAME is ${USER_NAME}" | ||
|
|
||
| # Get the RDMA priority and DSCP value from the NIC | ||
| # Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully) | ||
| if ! command -v nicctl >/dev/null 2>&1; then | ||
| echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2 | ||
| exit 1 | ||
| echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2 | ||
| fi | ||
|
|
||
| # Reduce log spam | ||
|
|
@@ -357,7 +360,7 @@ exec sudo docker run --rm \ | |
| --privileged \ | ||
| -v ${MODEL_DIR}:/models \ | ||
| -v \$HOME/.ssh:/root/.ssh \ | ||
| -v $(which nicctl):/usr/sbin/nicctl \ | ||
| $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \ | ||
|
Comment on lines
256
to
+411
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you verify whether these changes break mi355 disagg? +viz @Oseltamivir
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The check for nicctl was failing on this cluster. MoRI needs nicctl to enforce QoS; the check is disabled for now because nicctl is not installed on these nodes or in the built container, and it seems unnecessary. |
||
| --shm-size 128G \ | ||
| -v /tmp:/run_logs \ | ||
| -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ | ||
|
|
@@ -373,6 +376,7 @@ exec sudo docker run --rm \ | |
| -e xP=\$xP \ | ||
| -e yD=\$yD \ | ||
| -e MODEL_NAME=\$MODEL_NAME \ | ||
| -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \ | ||
| -e IPADDRS=\$IPADDRS \ | ||
| -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ | ||
| -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| source "$(dirname "$0")/../benchmark_lib.sh" | ||
|
|
||
| check_env_vars \ | ||
| CONC_LIST \ | ||
| ISL \ | ||
| OSL \ | ||
| IMAGE \ | ||
| SPEC_DECODING \ | ||
| MODEL_PATH \ | ||
| PREFILL_NUM_WORKERS \ | ||
| PREFILL_TP \ | ||
| PREFILL_EP \ | ||
| PREFILL_DP_ATTN \ | ||
| DECODE_NUM_WORKERS \ | ||
| DECODE_TP \ | ||
| DECODE_EP \ | ||
| DECODE_DP_ATTN \ | ||
| PREFILL_NODES \ | ||
| DECODE_NODES \ | ||
| RANDOM_RANGE_RATIO | ||
|
|
||
| if [[ -n "$SLURM_JOB_ID" ]]; then | ||
| echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" | ||
| fi | ||
|
|
||
| set -x | ||
|
|
||
| # Use upstreamed multi_node scripts (no external clone needed) | ||
| cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 | ||
|
|
||
| # Set up SGL launch script-specific environment variables | ||
| export TIME_LIMIT="08:00:00" | ||
| export MODEL_PATH=$MODEL_PATH | ||
| export MODEL_NAME=$MODEL_NAME | ||
| export CONTAINER_IMAGE=$IMAGE | ||
|
|
||
| if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then | ||
| export PREFILL_ENABLE_EP=false | ||
| else | ||
| export PREFILL_ENABLE_EP=true | ||
| fi | ||
|
|
||
| if [[ "$PREFILL_DP_ATTN" == "true" ]]; then | ||
| export PREFILL_ENABLE_DP=true | ||
| else | ||
| export PREFILL_ENABLE_DP=false | ||
| fi | ||
|
|
||
| if [[ "${DECODE_EP:-1}" -eq 1 ]]; then | ||
| export DECODE_ENABLE_EP=false | ||
| else | ||
| export DECODE_ENABLE_EP=true | ||
| fi | ||
|
|
||
| if [[ "$DECODE_DP_ATTN" == "true" ]]; then | ||
| export DECODE_ENABLE_DP=true | ||
| else | ||
| export DECODE_ENABLE_DP=false | ||
| fi | ||
|
|
||
| # Launch jobs based on ISL/OSL | ||
| # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented | ||
| # by a list of numbers delimited by 'x'. This is because of how the underlying launch script | ||
| # expects the concurrencies. | ||
| JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ | ||
| $PREFILL_NUM_WORKERS \ | ||
| $DECODE_NODES \ | ||
| $DECODE_NUM_WORKERS \ | ||
| $ISL $OSL "${CONC_LIST// /x}" inf \ | ||
| ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ | ||
| ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ | ||
| ${PREFILL_TP} ${DECODE_TP} \ | ||
| ${RANDOM_RANGE_RATIO}) | ||
|
|
||
| if [[ $? -ne 0 ]]; then | ||
| echo "Failed to submit job" >&2 | ||
| exit 1 | ||
| fi | ||
|
|
||
| echo "$JOB_ID" |

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're missing the perf changelog .yaml too.