vllm-project · simon-mo · Oct 31, 2025 · Oct 29, 2025 · Oct 29, 2025 · Oct 29, 2025
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+#
+# Scheduled integration test: serve DeepSeek-V2-lite with expert parallelism
+# and EPLB once per DeepEP all2all backend, run the GSM8K eval against it, and
+# fail if the reported accuracy is below THRESH.
+#
+# Args: [THRESH] [NUM_QUESTIONS] [START_PORT]
+set -euxo pipefail
+
+THRESH=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8010}
+
+# Block until http://127.0.0.1:<port>/health answers successfully.
+# Arguments: $1 - port to poll
+# Returns:   non-zero if the server is not healthy within 600 seconds
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+SERVER_PID=""
+
+# Stop the server launched by this script, if one is running: SIGTERM first,
+# SIGKILL only if it is still alive after ~30 seconds.
+# Globals: SERVER_PID (read, then cleared)
+stop_server() {
+  [[ -n "${SERVER_PID}" ]] || return 0
+  kill "${SERVER_PID}" 2>/dev/null || true
+  for _ in {1..60}; do
+    kill -0 "${SERVER_PID}" 2>/dev/null || break
+    sleep 0.5
+  done
+  if kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+  wait "${SERVER_PID}" 2>/dev/null || true
+  SERVER_PID=""
+}
+# Under `set -e` a failed eval or accuracy check exits immediately; the trap
+# makes sure the background server is not left running in that case.
+trap stop_server EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_ALL2ALL_BACKEND="$BACK" \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port "$PORT" &
+  SERVER_PID=$!
+  wait_for_server "$PORT"
+
+  # MODEL contains a "/"; flatten it so the results file lands directly in
+  # /tmp instead of under /tmp/deepseek-ai/, which nothing here creates.
+  OUT="/tmp/${MODEL//\//_}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  # Values are passed as argv (quoted heredoc) rather than spliced into the
+  # Python source, so odd characters in them cannot break the snippet.
+  python3 - "${OUT}" "${THRESH}" "${MODEL} ${BACK}" <<'PY'
+import json, sys
+path, thresh, tag = sys.argv[1], float(sys.argv[2]), sys.argv[3]
+with open(path) as f:
+    acc = json.load(f)['accuracy']
+print(f"{tag}: accuracy {acc:.3f}")
+assert acc >= thresh, f"{tag} accuracy {acc}"
+PY
+
+  stop_server
+  sleep 2
+  PORT=$((PORT+1))
+done
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+#
+# Scheduled integration test: serve Qwen3-30B-A3B-FP8 with expert parallelism
+# once per DeepEP all2all backend, run the GSM8K eval against it, and fail if
+# the reported accuracy is below THRESH.
+#
+# Args: [THRESH] [NUM_QUESTIONS] [START_PORT]
+set -euxo pipefail
+
+THRESH=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+
+# Block until http://127.0.0.1:<port>/health answers successfully.
+# Arguments: $1 - port to poll
+# Returns:   non-zero if the server is not healthy within 600 seconds
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+SERVER_PID=""
+
+# Stop the server launched by this script, if one is running: SIGTERM first,
+# SIGKILL only if it is still alive after ~30 seconds.
+# Globals: SERVER_PID (read, then cleared)
+stop_server() {
+  [[ -n "${SERVER_PID}" ]] || return 0
+  kill "${SERVER_PID}" 2>/dev/null || true
+  for _ in {1..60}; do
+    kill -0 "${SERVER_PID}" 2>/dev/null || break
+    sleep 0.5
+  done
+  if kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+  wait "${SERVER_PID}" 2>/dev/null || true
+  SERVER_PID=""
+}
+# Under `set -e` a failed eval or accuracy check exits immediately; the trap
+# makes sure the background server is not left running in that case.
+trap stop_server EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_ALL2ALL_BACKEND="$BACK" \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port "$PORT" &
+  SERVER_PID=$!
+  wait_for_server "$PORT"
+
+  OUT="/tmp/qwen30b_a3b_fp8_block_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
+  # Values are passed as argv (quoted heredoc) rather than spliced into the
+  # Python source, so odd characters in them cannot break the snippet.
+  python3 - "${OUT}" "${THRESH}" "${MODEL} ${BACK}" <<'PY'
+import json, sys
+path, thresh, tag = sys.argv[1], float(sys.argv[2]), sys.argv[3]
+with open(path) as f:
+    acc = json.load(f)['accuracy']
+print(f"{tag}: accuracy {acc:.3f}")
+assert acc >= thresh, f"{tag} accuracy {acc}"
+PY
+
+  stop_server
+  sleep 2
+  PORT=$((PORT+1))
+done
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -1203,3 +1203,21 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
     - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy (EP+EPLB; deepep HT/LL)
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  num_gpus: 4 # the script serves with --tensor-parallel-size 2 and --data-parallel-size 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 # args: THRESH NUM_QUESTIONS START_PORT
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (EP; deepep HT/LL)
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  num_gpus: 4 # the script serves with --tensor-parallel-size 2 and --data-parallel-size 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 # args: THRESH NUM_QUESTIONS START_PORT