Skip to content

Commit e50c454

Browse files
ilmarkov, SageMoore, tlrmchlsmth, LucasWilkinson
authored
[BugFix] Support EP/DP + EPLB with MTP (#25311)
Signed-off-by: ilmarkov <[email protected]> Signed-off-by: Sage Moore <[email protected]> Co-authored-by: Sage Moore <[email protected]> Co-authored-by: Tyler Michael Smith <[email protected]> Co-authored-by: Lucas Wilkinson <[email protected]>
1 parent 5d16d0f commit e50c454

File tree

27 files changed

+956
-528
lines changed

27 files changed

+956
-528
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,15 +232,16 @@ steps:
232232
commands:
233233
- pytest -v -s distributed/test_eplb_algo.py
234234

235-
- label: EPLB Execution Test # 5min
236-
timeout_in_minutes: 15
235+
- label: EPLB Execution Test # 10min
236+
timeout_in_minutes: 20
237237
working_dir: "/vllm-workspace/tests"
238238
num_gpus: 4
239239
source_file_dependencies:
240240
- vllm/distributed/eplb
241241
- tests/distributed/test_eplb_execute.py
242242
commands:
243243
- pytest -v -s distributed/test_eplb_execute.py
244+
- pytest -v -s distributed/test_eplb_spec_decode.py
244245

245246
- label: Metrics, Tracing Test # 12min
246247
timeout_in_minutes: 20
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
from __future__ import annotations
4+
5+
import lm_eval
6+
import pytest
7+
8+
from tests.utils import large_gpu_mark
9+
10+
11+
def get_model_args(
    model_name: str,
    spec_model_name: str,
    spec_method: str,
    tp_size: int,
    model_max_len: int,
) -> dict:
    """Assemble the lm_eval ``model_args`` dict for a vLLM EPLB run.

    The returned mapping configures the vLLM backend with expert
    parallelism + EPLB enabled and a one-token speculative-decoding
    setup using *spec_method* / *spec_model_name*.

    Args:
        model_name: HF name of the target (verifier) model.
        spec_model_name: HF name of the draft model (may be ``None``
            for methods such as MTP that draft from the target model —
            the key is still emitted with a ``None`` value).
        spec_method: Speculative decoding method, e.g. "mtp" or "eagle".
        tp_size: Tensor-parallel size; also reused as the number of
            redundant experts for EPLB.
        model_max_len: Context length applied to both the target and
            the draft model.

    Returns:
        A dict suitable for ``lm_eval.simple_evaluate(model_args=...)``.
    """
    # Single-token speculation keeps the run cheap while still
    # exercising the draft-model path.
    return {
        "pretrained": model_name,
        "dtype": "auto",
        "add_bos_token": True,
        "tensor_parallel_size": tp_size,
        "gpu_memory_utilization": 0.7,
        "speculative_config": {
            "method": spec_method,
            "model": spec_model_name,
            "num_speculative_tokens": 1,
            "max_model_len": model_max_len,
        },
        "enable_expert_parallel": True,
        "num_redundant_experts": tp_size,
        "eplb_window_size": 128,
        "eplb_step_interval": 1024,
        "eplb_log_balancedness": False,
        "enable_eplb": True,
        "max_model_len": model_max_len,
    }
42+
43+
@pytest.mark.parametrize(
    "model_setup",
    [
        pytest.param(
            ("mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86),
            marks=large_gpu_mark(min_gb=80),
        ),
        pytest.param(
            (
                "eagle",
                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
                4,
                0.92,
            ),
            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"),
        ),
    ],
    ids=["qwen3_next_mtp", "llama4_eagle"],
)
def test_eplb_spec_decode(
    monkeypatch: pytest.MonkeyPatch,
    model_setup: tuple[str, str, str, int, float],
):
    """
    Test the correctness of EPLB speculative decoding with GSM8K dataset.
    Applicable to MoE models with mtp or eagle spec decode.
    """
    method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup

    # GSM8K strict-match accuracy, accepted within an absolute
    # tolerance band around the known-good score for each model.
    task = "gsm8k"
    metric_filter = "exact_match,strict-match"
    tolerance = 0.03

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=get_model_args(
            model_name=model_name,
            spec_model_name=spec_model_name,
            spec_method=method,
            tp_size=tp_size,
            model_max_len=4096,
        ),
        tasks=task,
        batch_size=64,
        num_fewshot=8,
    )

    measured_value = results["results"][task][metric_filter]
    # Equivalent to: expected - tol < measured < expected + tol.
    assert abs(measured_value - expected_gsm8k_value) < tolerance, (
        f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
    )

0 commit comments

Comments
 (0)