Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
368ad79
Wip
ilmarkov Sep 20, 2025
e8aadae
Fix precommit
ilmarkov Sep 22, 2025
98395a6
Fix other mtp models
ilmarkov Sep 22, 2025
cda869d
Add eplb support to Llama4
ilmarkov Sep 22, 2025
7a519ee
Fix mllama4
ilmarkov Sep 23, 2025
ec2b02a
Refactor multi model eplb support
ilmarkov Sep 23, 2025
ca98544
Add test and fix
ilmarkov Sep 24, 2025
eeaca8f
Merge branch 'main' into fix_eplb_mtp
ilmarkov Sep 24, 2025
c161489
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 7, 2025
e713f42
Update spec decode
ilmarkov Oct 7, 2025
a70a344
init
SageMoore Oct 7, 2025
3b51ef9
comment
SageMoore Oct 7, 2025
123c8e6
Update qwen next
ilmarkov Oct 8, 2025
9149d25
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 13, 2025
27b6437
Cleanup
ilmarkov Oct 13, 2025
ff9f992
Update after review
ilmarkov Oct 14, 2025
d4532a6
Update buildkite pipeline test time
ilmarkov Oct 15, 2025
b0c8cd3
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 15, 2025
7c5b5b1
Improve sync. Update after review
ilmarkov Oct 23, 2025
96d4b37
Fix comment
ilmarkov Oct 23, 2025
43755f6
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 23, 2025
477a955
Refactor
ilmarkov Oct 27, 2025
6880c9f
Refactor glm4
ilmarkov Oct 27, 2025
4ab42aa
Update moemixin
ilmarkov Oct 27, 2025
bf4dcbc
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 27, 2025
7d0ee28
Update comment for V1 Test e2e + engine
ilmarkov Oct 28, 2025
d129097
Update startup logging
ilmarkov Oct 27, 2025
a77b99f
Update test
ilmarkov Oct 28, 2025
ef3c9a1
Upd test constants
ilmarkov Oct 28, 2025
7e60b26
Upd test time
ilmarkov Oct 28, 2025
df918b2
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 28, 2025
f4fad37
Upd
ilmarkov Oct 29, 2025
644c328
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Oct 29, 2025
94e3390
Fix glm4moe
ilmarkov Oct 31, 2025
69786a5
Merge branch 'main' into imarkov/fix_eplb_mtp
tlrmchlsmth Oct 31, 2025
09f9869
Fix CI
ilmarkov Oct 31, 2025
74f806b
Update gpu_memory_utilization to 0.93
ilmarkov Oct 31, 2025
0e8dc73
Fix
ilmarkov Nov 2, 2025
7f4b831
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Nov 2, 2025
b88f680
Fix oom
ilmarkov Nov 2, 2025
70b66a7
Merge branch 'main' into imarkov/fix_eplb_mtp
LucasWilkinson Nov 4, 2025
6e17f0f
Merge remote-tracking branch 'origin/main' into imarkov/fix_eplb_mtp
ilmarkov Nov 4, 2025
e4fa241
Update moe_layers. Clean OpenPangu
ilmarkov Nov 4, 2025
deb21b1
Fix mypy
ilmarkov Nov 4, 2025
a9938e7
Merge branch 'main' into imarkov/fix_eplb_mtp
ilmarkov Nov 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ steps:
- pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Does your test trip the timeout?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We add a new test that might take up to 15 minutes, so we need to increase the timeout.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does seem a bit excessive for a somewhat niche use case. I'm admittedly not well versed in the CI hierarchy, but would it make more sense to just run one model here and the rest in a nightly?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's please not add a test that takes 15 minutes

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, it only takes 5 minutes. Good enough - Let's add this to the EPLB execution test instead:

- label: EPLB Execution Test # 5min
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py

timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
Expand Down
119 changes: 119 additions & 0 deletions tests/v1/e2e/test_eplb_spec_decode.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add a comment describing what this test is for?

Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations

import pytest
import torch

from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory


def create_test_prompts() -> list[str]:
    """Return the small fixed set of prompts shared by the EPLB tests."""
    prompts = [
        "A robot may not injure a human being",
        "To be or not to be,",
        "What is the meaning of life?",
    ]
    return prompts


@pytest.fixture
def sampling_config():
    """Greedy sampling (temperature=0) with a short, 10-token budget."""
    params = SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
    return params


@pytest.mark.parametrize(
    "model_setup",
    [
        ("meta-llama/Llama-4-Scout-17B-16E-Instruct", 4),
    ],
    ids=["llama4"],
)
def test_eplb_model(
    monkeypatch: pytest.MonkeyPatch,
    sampling_config: SamplingParams,
    model_setup: tuple[str, int],
):
    """End-to-end smoke test: run generation on a MoE model with EPLB
    (expert-parallel load balancing) enabled, without speculative decoding.

    model_setup is (model name, tensor-parallel size).
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_MLA_DISABLE", "1")

        model_name, tp_size = model_setup
        # Build the prompts once (the original built them twice, before and
        # after LLM construction; the second call shadowed the first).
        test_prompts = create_test_prompts()
        llm = LLM(
            model=model_name,
            tensor_parallel_size=tp_size,
            max_model_len=2048,
            enable_expert_parallel=True,
            num_redundant_experts=tp_size,
            # NOTE(review): small window/interval — presumably chosen so
            # EPLB rebalancing actually triggers during this short run;
            # confirm against the EPLB step logic.
            eplb_window_size=4,
            eplb_step_interval=16,
            eplb_log_balancedness=True,
            enable_eplb=True,
            load_format="dummy",
            gpu_memory_utilization=0.95,
        )
        llm.generate(test_prompts, sampling_config)
        # Explicit teardown so the next parametrized case starts clean.
        del llm
        torch.cuda.empty_cache()
        cleanup_dist_env_and_memory()


@pytest.mark.parametrize(
    "model_setup",
    [
        (
            "eagle",
            "eagle618/deepseek-v3-random",
            "eagle618/eagle-deepseek-v3-random",
            4,
        ),
        ("deepseek_mtp", "eagle618/deepseek-v3-random", None, 4),
        ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4),
        pytest.param(
            (
                "eagle",
                "meta-llama/Llama-4-Scout-17B-16E-Instruct",
                "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
                4,
            ),
            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"),
        ),
    ],
    ids=["deepseek_eagle", "deepseek_mtp", "qwen3_next_mtp", "llama4_eagle"],
)
def test_eplb_spec_decode(
    monkeypatch: pytest.MonkeyPatch,
    sampling_config: SamplingParams,
    model_setup: tuple[str, str, str, int],
):
    """Smoke test that EPLB runs together with speculative decoding.

    model_setup is (spec-decode method, target model, draft model or None,
    tensor-parallel size).
    """
    method, model_name, spec_model_name, tp_size = model_setup
    spec_config = {
        "method": method,
        "model": spec_model_name,
        "num_speculative_tokens": 1,
        "max_model_len": 2048,
    }
    with monkeypatch.context() as patch_ctx:
        patch_ctx.setenv("VLLM_USE_V1", "1")
        patch_ctx.setenv("VLLM_MLA_DISABLE", "1")

        llm = LLM(
            model=model_name,
            trust_remote_code=True,
            tensor_parallel_size=tp_size,
            speculative_config=spec_config,
            max_model_len=2048,
            enable_expert_parallel=True,
            num_redundant_experts=tp_size,
            # NOTE(review): window/interval are much larger than in the
            # non-spec-decode test above — presumably to keep rebalancing
            # rare during this short run; confirm intent.
            eplb_window_size=1000,
            eplb_step_interval=3000,
            eplb_log_balancedness=True,
            enable_eplb=True,
            load_format="dummy",
        )
        prompts = create_test_prompts()
        llm.generate(prompts, sampling_config)
        # Explicit teardown so the next parametrized case starts clean.
        del llm
        torch.cuda.empty_cache()
        cleanup_dist_env_and_memory()
Loading