From 368ad79d9b969b58198bd570c81cc74fc388e0ca Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Sat, 20 Sep 2025 04:27:43 -0700 Subject: [PATCH 01/32] Wip Signed-off-by: ilmarkov --- vllm/distributed/eplb/eplb_state.py | 31 +++++++++++++ vllm/model_executor/models/deepseek_mtp.py | 48 ++++++++++++++++++-- vllm/v1/spec_decode/eagle.py | 53 +++++++++++++++++++++- vllm/v1/spec_decode/medusa.py | 53 +++++++++++++++++++++- vllm/v1/worker/gpu_model_runner.py | 35 +++----------- 5 files changed, 185 insertions(+), 35 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 3e318d784832..d5da638970f7 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -582,6 +582,37 @@ def recv_state() -> tuple[torch.Tensor, torch.Tensor]: return global_expert_load, old_global_expert_indices + @classmethod + def get_epp_state( + cls, + parallel_config: ParallelConfig, + eep_scale_up: bool = False + ) -> tuple[torch.Tensor, torch.Tensor, dict[int, int]]: + if not eep_scale_up: + return None, None, None + num_local_physical_experts = torch.empty(1, + dtype=torch.int32, + device="cpu") + torch.distributed.broadcast(num_local_physical_experts, + group=get_ep_group().cpu_group, + group_src=0) + num_local_physical_experts = int(num_local_physical_experts.item()) + new_ep_size = get_ep_group().world_size + global_expert_load, old_global_expert_indices = ( + EplbState.recv_state()) + num_logical_experts = global_expert_load.shape[1] + parallel_config.eplb_config.num_redundant_experts = ( + num_local_physical_experts * new_ep_size - num_logical_experts) + assert old_global_expert_indices.shape[ + 1] % num_local_physical_experts == 0 + old_ep_size = old_global_expert_indices.shape[ + 1] // num_local_physical_experts + rank_mapping = { + old_ep_rank: old_ep_rank + for old_ep_rank in range(old_ep_size) + } + return global_expert_load, old_global_expert_indices, rank_mapping + def _node_count_with_rank_mapping( pg: Union[ProcessGroup, StatelessProcessGroup], diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 92f311ab465b..ae7148e659c3 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -17,9 +17,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors -from .deepseek_v2 import (DeepseekV2DecoderLayer, +from .deepseek_v2 import (DeepseekV2DecoderLayer, DeepseekV2MoE, get_spec_layer_idx_from_weight_name) -from .interfaces import SupportsPP +from .interfaces import MixtureOfExperts, SupportsPP from .utils import maybe_prefix @@ -133,7 +133,7 @@ def compute_logits( return logits -class DeepSeekMTP(nn.Module, SupportsPP): +class DeepSeekMTP(nn.Module, SupportsPP, MixtureOfExperts): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -141,6 +141,48 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = DeepSeekMultiTokenPredictor(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "model")) + config = vllm_config.model_config.hf_config + + self.expert_weights = [] + # Set MoE hyperparameters + self.num_moe_layers = config.num_nextn_predict_layers + self.num_expert_groups = config.n_group + + self.moe_layers: list[FusedMoE] = [] + example_moe = None + for layer in self.model.layers.values(): + assert isinstance(layer, DeepSeekMultiTokenPredictorLayer) + layer = layer.mtp_block + assert 
isinstance(layer, DeepseekV2DecoderLayer) + if isinstance(layer.mlp, DeepseekV2MoE): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError("No DeepseekV2MoE layer found in model.layers.") + + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) def forward( self, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index dc97d5c8f39d..05ec6435afeb 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -12,11 +12,13 @@ from vllm.attention.layer import Attention from vllm.config import (CompilationLevel, VllmConfig, get_layers_from_vllm_config) +from vllm.distributed.eplb.eplb_state import EplbState from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal +from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -62,6 +64,9 @@ def __init__( self.method = self.speculative_config.method self.runner = runner + self.eplb_state: Optional[EplbState] = None + self.device = device + self.dtype = vllm_config.model_config.dtype self.max_model_len = vllm_config.model_config.max_model_len self.block_size = vllm_config.cache_config.block_size @@ -371,6 +376,9 @@ def propose( draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) + # EPLB step + self.eplb_step() + # [batch_size, num_speculative_tokens] draft_token_ids = torch.stack(draft_token_ids_list, dim=1) return draft_token_ids @@ -798,7 +806,13 @@ def prepare_inputs( return spec_common_attn_metadata, token_indices - def load_model(self, target_model: nn.Module) -> None: + def load_model(self, + target_model: nn.Module, + eep_scale_up: bool = False) -> None: + global_expert_load, old_global_expert_indices, rank_mapping = \ + EplbState.get_epp_state(self.vllm_config.parallel_config, + eep_scale_up) + draft_model_config = \ self.vllm_config.speculative_config.draft_model_config target_attn_layer_names = set( @@ -859,10 +873,45 @@ def load_model(self, target_model: nn.Module) -> None: logger.info("Loading EAGLE LM head weights from the target model.") self.model.lm_head = target_language_model.lm_head + if is_mixture_of_experts( + self.model) and self.vllm_config.parallel_config.enable_eplb: + logger.info("EPLB is enabled for Eagle drafter model %s.", + draft_model_config.model) + + self.eplb_state = EplbState.build( + self.model, + 
self.device, + self.vllm_config.parallel_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, + ) + + def eplb_step(self, + is_dummy: bool = False, + is_profile: bool = False) -> None: + """ + Step for the EPLB (Expert Parallelism Load Balancing) state. + """ + if not self.vllm_config.parallel_config.enable_eplb: + return + + assert self.eplb_state is not None + assert is_mixture_of_experts(self.model) + self.eplb_state.step( + self.model, + is_dummy, + is_profile, + log_stats=self.vllm_config.parallel_config.eplb_config. + log_balancedness, + ) + @torch.inference_mode() def dummy_run( self, num_tokens: int, + skip_eplb: bool = False, + is_profile: bool = False, ) -> None: with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): @@ -879,6 +928,8 @@ def dummy_run( hidden_states=self.hidden_states[:num_tokens], inputs_embeds=inputs_embeds, ) + if not skip_eplb: + self.eplb_step(is_dummy=True, is_profile=is_profile) def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 70b29c05c2a5..b910e9c1bb4f 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -1,13 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional import torch import torch.nn as nn from vllm.config import VllmConfig +from vllm.distributed.eplb.eplb_state import EplbState from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.v1.sample.metadata import SamplingMetadata # Initialize logger @@ -33,6 +36,8 @@ def __init__( draft_model_config.get_hidden_size( ) self.dtype = vllm_config.model_config.dtype + self.eplb_state: Optional[EplbState] = None + self.device = device def propose( self, @@ -49,18 +54,62 @@ def propose( draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits] return [list(row) for row in zip(*draft_tokens)] - def load_model(self, target_model: nn.Module) -> None: + def eplb_step(self, + is_dummy: bool = False, + is_profile: bool = False) -> None: + """ + Step for the EPLB (Expert Parallelism Load Balancing) state. + """ + if not self.vllm_config.parallel_config.enable_eplb: + return + + assert self.eplb_state is not None + assert is_mixture_of_experts(self.model) + self.eplb_state.step( + self.model, + is_dummy, + is_profile, + log_stats=self.vllm_config.parallel_config.eplb_config. + log_balancedness, + ) + + def load_model(self, + target_model: nn.Module, + eep_scale_up: bool = False) -> None: + global_expert_load, old_global_expert_indices, rank_mapping = \ + EplbState.get_epp_state(self.vllm_config.parallel_config, + eep_scale_up) + from vllm.compilation.backends import set_model_tag with set_model_tag("medusa_head"): self.model = get_model(vllm_config=self.vllm_config, model_config=self.vllm_config. 
speculative_config.draft_model_config) + if is_mixture_of_experts( + self.model) and self.vllm_config.parallel_config.enable_eplb: + logger.info( + "EPLB is enabled for Eagle drafter model %s.", + self.vllm_config.speculative_config.draft_model_config.model) + + self.eplb_state = EplbState.build( + self.model, + self.device, + self.vllm_config.parallel_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, + ) @torch.inference_mode() - def dummy_run(self, num_tokens: int) -> None: + def dummy_run(self, + num_tokens: int, + skip_eplb: bool = False, + is_profile: bool = False) -> None: hidden_states = torch.zeros((self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=self.device) with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): self.model(hidden_states) + if not skip_eplb: + self.eplb_step(is_dummy=True, is_profile=is_profile) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b0cd0f413307..572afcb38352 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2527,33 +2527,8 @@ def load_model(self, eep_scale_up: bool = False) -> None: eep_scale_up: the model loading is for elastic EP scale up. """ logger.info("Starting to load model %s...", self.model_config.model) - if eep_scale_up: - from vllm.distributed.parallel_state import get_ep_group - num_local_physical_experts = torch.empty(1, - dtype=torch.int32, - device="cpu") - torch.distributed.broadcast(num_local_physical_experts, - group=get_ep_group().cpu_group, - group_src=0) - num_local_physical_experts = int(num_local_physical_experts.item()) - new_ep_size = get_ep_group().world_size - global_expert_load, old_global_expert_indices = ( - EplbState.recv_state()) - num_logical_experts = global_expert_load.shape[1] - self.parallel_config.eplb_config.num_redundant_experts = ( - num_local_physical_experts * new_ep_size - num_logical_experts) - assert old_global_expert_indices.shape[ - 1] % num_local_physical_experts == 0 - old_ep_size = old_global_expert_indices.shape[ - 1] // num_local_physical_experts - rank_mapping = { - old_ep_rank: old_ep_rank - for old_ep_rank in range(old_ep_size) - } - else: - global_expert_load = None - old_global_expert_indices = None - rank_mapping = None + global_expert_load, old_global_expert_indices, rank_mapping = \ + EplbState.get_epp_state(self.parallel_config, eep_scale_up) with DeviceMemoryProfiler() as m: time_before_load = time.perf_counter() @@ -2566,7 +2541,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.device) if hasattr(self, "drafter"): logger.info("Loading drafter model...") - self.drafter.load_model(self.model) + self.drafter.load_model(self.model, eep_scale_up=eep_scale_up) if self.use_aux_hidden_state_outputs: if supports_eagle3(self.model): self.model.set_aux_hidden_state_layers( @@ -3079,7 +3054,9 @@ def _dummy_run( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - self.drafter.dummy_run(num_tokens) + self.drafter.dummy_run(num_tokens, + skip_eplb=skip_eplb, + is_profile=is_profile) # This is necessary to avoid blocking DP. 
# For dummy runs, we typically skip EPLB since we don't have any real From e8aadaef6fd3ec41485ae67899366bfdaf2135ee Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 22 Sep 2025 01:39:48 -0700 Subject: [PATCH 02/32] Fix precommit Signed-off-by: ilmarkov --- vllm/distributed/eplb/eplb_state.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index d5da638970f7..93ad2bd569ff 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -587,7 +587,8 @@ def get_epp_state( cls, parallel_config: ParallelConfig, eep_scale_up: bool = False - ) -> tuple[torch.Tensor, torch.Tensor, dict[int, int]]: + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[dict[ + int, int]]]: if not eep_scale_up: return None, None, None num_local_physical_experts = torch.empty(1, From 98395a6a96c2b07bd1ff19e63dbf090d40af54d9 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 22 Sep 2025 12:20:35 -0700 Subject: [PATCH 03/32] Fix other mtp models Signed-off-by: ilmarkov --- vllm/model_executor/models/deepseek_mtp.py | 21 +++++++ vllm/model_executor/models/glm4_moe_mtp.py | 48 ++++++++++++++- vllm/model_executor/models/qwen3_next_mtp.py | 65 +++++++++++++++++++- vllm/v1/spec_decode/eagle.py | 4 +- vllm/v1/spec_decode/medusa.py | 40 +----------- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/gpu_worker.py | 56 ++++++++++++----- 7 files changed, 175 insertions(+), 64 deletions(-) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index ae7148e659c3..a23d5c5c6dca 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -184,6 +184,27 @@ def set_eplb_state( logical_replica_count=logical_replica_count, ) + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers.values(): + assert isinstance(layer, DeepSeekMultiTokenPredictorLayer) + layer = layer.mtp_block + assert isinstance(layer, DeepseekV2DecoderLayer) + if isinstance(layer.mlp, DeepseekV2MoE): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index c572978e6220..eb4f84700b09 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -40,8 +40,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors -from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name -from .interfaces import SupportsPP +from .glm4_moe import (Glm4MoE, Glm4MoeDecoderLayer, + get_spec_layer_idx_from_weight_name) +from .interfaces import MixtureOfExperts, SupportsPP from .utils import maybe_prefix @@ -164,7 +165,7 @@ def compute_logits( return logits -class Glm4MoeMTP(nn.Module, SupportsPP): +class Glm4MoeMTP(nn.Module, SupportsPP, 
MixtureOfExperts):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -172,6 +173,47 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = Glm4MoeMultiTokenPredictor(vllm_config=vllm_config,
                                                 prefix=maybe_prefix(
                                                     prefix, "model"))
+        self.expert_weights = []
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = self.config.num_nextn_predict_layers
+        self.num_expert_groups = self.config.n_group
+
+        self.moe_layers: list[FusedMoE] = []
+        example_moe = None
+        for layer in self.model.layers.values():
+            assert isinstance(layer, Glm4MoeMultiTokenPredictorLayer)
+            layer = layer.mtp_block
+            assert isinstance(layer, Glm4MoeDecoderLayer)
+            if isinstance(layer.mlp, Glm4MoE):
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None:
+            raise RuntimeError("No Glm4MoE layer found in model.layers.")
+
+        self.num_logical_experts = example_moe.n_logical_experts
+        self.num_physical_experts = example_moe.n_physical_experts
+        self.num_local_physical_experts = example_moe.n_local_physical_experts
+        self.num_routed_experts = example_moe.n_routed_experts
+        self.num_shared_experts = example_moe.n_shared_experts
+        self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ) -> None:
+        for layer_idx, layer in enumerate(self.moe_layers):
+            # Register the expert weights.
+            self.expert_weights.append(layer.get_expert_weights())
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
index c054339842e6..7c86dca45d21 100644
--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -18,11 +18,12 @@
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen3_next import (Qwen3NextDecoderLayer,
-                                                   Qwen3NextRMSNorm)
+                                                   Qwen3NextRMSNorm,
+                                                   Qwen3NextSparseMoeBlock)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import Qwen3NextConfig
 
-from .interfaces import SupportsPP
+from .interfaces import MixtureOfExperts, SupportsPP
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, maybe_prefix)
 
@@ -211,7 +212,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 @support_torch_compile
-class Qwen3NextMTP(nn.Module, SupportsPP):
+class Qwen3NextMTP(nn.Module, SupportsPP, MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -246,6 +247,64 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
+        self.expert_weights = []
+
+        self.moe_layers: list[FusedMoE] = []
+        example_moe = None
+        for layer in self.model.layers.values():
+            assert isinstance(layer, Qwen3NextDecoderLayer)
+            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None:
+            raise RuntimeError("No Qwen3Next layer found in the model.layers.")
+
+        # Set MoE hyperparameters
+        self.num_moe_layers = len(self.moe_layers)
+        
self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers.values(): + assert isinstance(layer, Qwen3NextDecoderLayer) + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 05ec6435afeb..2beb0537c99c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -893,11 +893,11 @@ def eplb_step(self, """ Step for the EPLB (Expert Parallelism Load Balancing) state. """ - if not self.vllm_config.parallel_config.enable_eplb: + if not self.vllm_config.parallel_config.enable_eplb or \ + not is_mixture_of_experts(self.model): return assert self.eplb_state is not None - assert is_mixture_of_experts(self.model) self.eplb_state.step( self.model, is_dummy, diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index b910e9c1bb4f..4a0e3f4cdfb0 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -54,51 +54,15 @@ def propose( draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits] return [list(row) for row in zip(*draft_tokens)] - def eplb_step(self, - is_dummy: bool = False, - is_profile: bool = False) -> None: - """ - Step for the EPLB (Expert Parallelism Load Balancing) state. - """ - if not self.vllm_config.parallel_config.enable_eplb: - return - - assert self.eplb_state is not None - assert is_mixture_of_experts(self.model) - self.eplb_state.step( - self.model, - is_dummy, - is_profile, - log_stats=self.vllm_config.parallel_config.eplb_config. - log_balancedness, - ) - def load_model(self, target_model: nn.Module, eep_scale_up: bool = False) -> None: - global_expert_load, old_global_expert_indices, rank_mapping = \ - EplbState.get_epp_state(self.vllm_config.parallel_config, - eep_scale_up) - from vllm.compilation.backends import set_model_tag with set_model_tag("medusa_head"): self.model = get_model(vllm_config=self.vllm_config, model_config=self.vllm_config. 
speculative_config.draft_model_config) - if is_mixture_of_experts( - self.model) and self.vllm_config.parallel_config.enable_eplb: - logger.info( - "EPLB is enabled for Eagle drafter model %s.", - self.vllm_config.speculative_config.draft_model_config.model) - - self.eplb_state = EplbState.build( - self.model, - self.device, - self.vllm_config.parallel_config, - global_expert_load, - old_global_expert_indices, - rank_mapping, - ) + assert not is_mixture_of_experts(self.model) @torch.inference_mode() def dummy_run(self, @@ -111,5 +75,3 @@ def dummy_run(self, with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): self.model(hidden_states) - if not skip_eplb: - self.eplb_step(is_dummy=True, is_profile=is_profile) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 572afcb38352..fd4396bfcb49 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2542,6 +2542,10 @@ def load_model(self, eep_scale_up: bool = False) -> None: if hasattr(self, "drafter"): logger.info("Loading drafter model...") self.drafter.load_model(self.model, eep_scale_up=eep_scale_up) + if hasattr(self.drafter, "eplb_state"): + assert hasattr(self.drafter, "model") and \ + is_mixture_of_experts(self.drafter.model) + self.drafter_eplb_state = self.drafter.eplb_state if self.use_aux_hidden_state_outputs: if supports_eagle3(self.model): self.model.set_aux_hidden_state_layers( @@ -2556,7 +2560,6 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.model_memory_usage / GiB_bytes, time_after_load - time_before_load) prepare_communication_buffer_for_model(self.model) - if is_mixture_of_experts( self.model) and self.parallel_config.enable_eplb: logger.info("EPLB is enabled for model %s.", diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 8b1e1bb8f45c..d4db23eedcb4 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -576,27 +576,47 @@ def _reconfigure_moe(self, old_ep_size: int, from vllm.distributed.parallel_state import ( get_dp_group, get_ep_group, prepare_communication_buffer_for_model) from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoEParallelConfig) - + FusedMoE, FusedMoEParallelConfig) parallel_config = self.vllm_config.parallel_config - moe_modules = [ + + def update_moe_modules(moe_modules: list[FusedMoE]): + assert all( + module.moe_config.num_local_experts == num_local_experts + for module in moe_modules), ( + "All MoE modules must have the same number of experts") + for module in moe_modules: + module.moe_config.num_experts = num_local_experts * new_ep_size + module.global_num_experts = module.moe_config.num_experts + module.moe_parallel_config = FusedMoEParallelConfig.make( + tp_size_=get_tp_group().world_size, + dp_size_=get_dp_group().world_size, + vllm_parallel_config=parallel_config, + ) + module.moe_config.moe_parallel_config = \ + module.moe_parallel_config + + model_moe_modules = [ module for module in self.model_runner.model.modules() if (module.__class__.__name__ == "FusedMoE" or module.__class__.__name__ == "SharedFusedMoE") ] - num_local_experts = moe_modules[0].moe_config.num_local_experts - assert all(module.moe_config.num_local_experts == num_local_experts - for module in moe_modules), ( - "All MoE modules must have the same number of experts") - for module in moe_modules: - module.moe_config.num_experts = num_local_experts * new_ep_size - module.global_num_experts = module.moe_config.num_experts - module.moe_parallel_config = 
FusedMoEParallelConfig.make( - tp_size_=get_tp_group().world_size, - dp_size_=get_dp_group().world_size, - vllm_parallel_config=parallel_config, - ) - module.moe_config.moe_parallel_config = module.moe_parallel_config + num_local_experts = model_moe_modules[0].moe_config.num_local_experts + + update_moe_modules(model_moe_modules) + if hasattr(self.model_runner, "drafter") and hasattr( + self.model_runner, "drafter_eplb_state") and \ + hasattr(self.model_runner.drafter, "model"): + drafter_moe_modules = [ + module for module in self.model_runner.drafter.model.modules() + if (module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE") + ] + # Check if drafter and model have matching configs + assert drafter_moe_modules[ + 0].moe_config.num_local_experts == num_local_experts, \ + "Drafter and model configs should be the same" + update_moe_modules(drafter_moe_modules) + if new_ep_size < old_ep_size: num_local_physical_experts = num_local_experts assert self.model_runner.eplb_state is not None @@ -621,6 +641,10 @@ def _reconfigure_moe(self, old_ep_size: int, parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_load.shape[1]) prepare_communication_buffer_for_model(self.model_runner.model) + if hasattr(self.model_runner, "drafter") and \ + hasattr(self.model_runner.drafter, "model"): + prepare_communication_buffer_for_model( + self.model_runner.drafter.model) self.model_runner.model.update_physical_experts_metadata( num_physical_experts=new_physical_experts, num_local_physical_experts=num_local_physical_experts) From cda869d3f605f9d16b50deb71e81fa4fa669cddf Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 22 Sep 2025 12:29:59 -0700 Subject: [PATCH 04/32] Add eplb support to Llama4 Signed-off-by: ilmarkov --- vllm/model_executor/models/llama.py | 5 +- vllm/model_executor/models/llama4.py | 90 +++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1b03cbef501b..eb6ec8278fa9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -34,7 +34,7 @@ from vllm.attention import Attention, AttentionType from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -242,6 +242,7 @@ def __init__( config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + parallel_config: Optional[ParallelConfig] = None, prefix: str = "", ) -> None: super().__init__() @@ -338,6 +339,7 @@ def __init__(self, cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + parallel_config = vllm_config.parallel_config self.config = config self.quant_config = quant_config @@ -360,6 +362,7 @@ def __init__(self, lambda prefix: layer_type(config=config, cache_config=cache_config, quant_config=quant_config, + parallel_config=parallel_config, prefix=prefix), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index ddd7e6a5936e..77cce829c924 100644 
--- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -27,8 +27,8 @@ from vllm.attention import Attention from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, ParallelConfig, VllmConfig +from vllm.distributed import get_ep_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -39,6 +39,7 @@ from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.interfaces import MixtureOfExperts from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk, @@ -62,10 +63,14 @@ def custom_routing_function( def __init__(self, config: Llama4TextConfig, quant_config: Optional[QuantizationConfig] = None, + parallel_config: Optional[ParallelConfig] = None, prefix: str = ""): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() self.top_k = config.num_experts_per_tok + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() intermediate_size_moe = config.intermediate_size self.router = ReplicatedLinear(config.hidden_size, @@ -84,6 +89,21 @@ def __init__(self, reduce_results=False, ) + # Load balancing settings. + eplb_config = parallel_config.eplb_config if parallel_config else None + self.enable_eplb = parallel_config.enable_eplb \ + if parallel_config else False + self.n_redundant_experts = eplb_config.num_redundant_experts \ + if eplb_config else 0 + + self.n_routed_experts: int = config.num_local_experts + self.n_logical_experts = self.n_routed_experts + self.n_shared_experts: int = 1 + self.n_local_experts: int = config.num_local_experts + self.n_physical_experts = (self.n_local_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + self.experts = SharedFusedMoE( shared_experts=self.shared_expert, num_experts=config.num_local_experts, @@ -96,6 +116,8 @@ def __init__(self, renormalize=False, quant_config=quant_config, prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, ) def forward(self, hidden_states): @@ -262,6 +284,7 @@ def __init__( config: Llama4TextConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + parallel_config: Optional[ParallelConfig] = None, prefix: str = "", ) -> None: super().__init__() @@ -293,6 +316,7 @@ def __init__( self.feed_forward = Llama4MoE( config=config, quant_config=quant_config, + parallel_config=parallel_config, prefix=f"{prefix}.feed_forward", ) else: @@ -641,7 +665,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -class Llama4ForCausalLM(LlamaForCausalLM): +class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], @@ -663,6 +687,66 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=prefix, layer_type=Llama4DecoderLayer) + 
self.expert_weights = [] + + self.moe_layers: list[FusedMoE] = [] + example_moe = None + for layer in self.model.layers: + assert isinstance(layer, Llama4DecoderLayer) + if isinstance(layer.feed_forward, Llama4MoE): + # Pick last one layer since the first ones may be dense layers. + example_moe = layer.feed_forward + self.moe_layers.append(layer.feed_forward.experts) + + if example_moe is None: + raise RuntimeError("No Llama4MoE layer found in model.layers.") + + # Set MoE hyperparameters + self.num_moe_layers = len(self.moe_layers) + print(f"num_moe_layers: {self.num_moe_layers}") + self.num_expert_groups = 1 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + print(f"set eplb state layer_idx: {layer_idx}") + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.feed_forward, Llama4MoE): + moe = layer.feed_forward + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def _init_model(self, vllm_config: VllmConfig, prefix: str = "", From 7a519ee0437212998647e6f2707ab43b200cfb8d Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 23 Sep 2025 02:48:18 -0700 Subject: [PATCH 05/32] Fix mllama4 Signed-off-by: ilmarkov --- vllm/model_executor/models/llama4.py | 2 -- vllm/model_executor/models/mllama4.py | 32 ++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 77cce829c924..da5c06a1ddf4 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -703,7 +703,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Set MoE hyperparameters self.num_moe_layers = len(self.moe_layers) - print(f"num_moe_layers: {self.num_moe_layers}") self.num_expert_groups = 1 self.num_logical_experts = example_moe.n_logical_experts self.num_physical_experts = example_moe.n_physical_experts @@ -721,7 +720,6 @@ def set_eplb_state( for layer_idx, layer in enumerate(self.moe_layers): # Register the expert weights. 
self.expert_weights.append(layer.get_expert_weights()) - print(f"set eplb state layer_idx: {layer_idx}") layer.set_eplb_state( moe_layer_idx=layer_idx, expert_load_view=expert_load_view, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 131a66b71323..3cc9b32ecd87 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -54,7 +54,8 @@ from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import (MixtureOfExperts, MultiModalEmbeddings, + SupportsMultiModal, SupportsPP) from .llama4 import Llama4ForCausalLM from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) @@ -708,8 +709,8 @@ def get_dummy_mm_data( info=Mllama4ProcessingInfo, dummy_inputs=Mllama4DummyInputsBuilder, ) -class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): +class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, + MixtureOfExperts): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -758,6 +759,31 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + # Set MoE hyperparameters + self.num_expert_groups = 1 + self.num_logical_experts = self.language_model.num_logical_experts + self.num_physical_experts = self.language_model.num_physical_experts + self.num_local_physical_experts = \ + self.language_model.num_local_physical_experts + self.num_routed_experts = self.language_model.num_routed_experts + self.num_shared_experts = self.language_model.num_shared_experts + self.num_redundant_experts = self.language_model.num_redundant_experts + self.moe_layers = self.language_model.moe_layers + self.num_moe_layers = len(self.moe_layers) + + def set_eplb_state(self, expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor): + self.language_model.set_eplb_state(expert_load_view, + logical_to_physical_map, + logical_replica_count) + self.expert_weights = self.language_model.expert_weights + + def update_physical_experts_metadata(self, num_physical_experts: int, + num_local_physical_experts: int): + self.language_model.update_physical_experts_metadata( + num_physical_experts, num_local_physical_experts) + def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]: # num_images, 1, num_chunks, channel, image_size, image_size From ec2b02a86baf1be999450cbed72753cf949baf76 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 23 Sep 2025 12:38:05 -0700 Subject: [PATCH 06/32] Refactor multi model eplb support Signed-off-by: ilmarkov --- vllm/distributed/eplb/eplb_state.py | 535 +++++++++++++++++----------- vllm/v1/spec_decode/eagle.py | 47 +-- vllm/v1/worker/gpu_model_runner.py | 59 ++- vllm/v1/worker/gpu_worker.py | 31 +- 4 files changed, 383 insertions(+), 289 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 93ad2bd569ff..791a8f0bf567 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -34,7 +34,7 @@ import torch from torch.distributed import ProcessGroup, all_reduce -from vllm.config import ParallelConfig +from vllm.config import ModelConfig, ParallelConfig 
from vllm.distributed.parallel_state import (get_ep_group, get_node_count, in_the_same_node_as) from vllm.distributed.utils import StatelessProcessGroup @@ -48,7 +48,7 @@ @dataclass -class EplbState: +class EplbModelState: """EPLB metrics.""" physical_to_logical_map: torch.Tensor @@ -128,34 +128,46 @@ class EplbState: See: https://github.com/vllm-project/vllm/pull/22167#pullrequestreview-3086143856 """ - expert_load_window_step: int = 0 - """ - Current step in the sliding window. + model_name: str + model: MixtureOfExperts - Different from `expert_rearrangement_step`, each EP rank may have its own - `expert_load_window_step`. - """ - expert_load_window_size: int = 0 + +class EplbState: """ - Size of the expert load sliding window. - This is a constant and is taken from the config. + EplbState of each expert parallel model. Key is the model config hash. """ - expert_rearrangement_step: int = 0 - """ - Steps after last rearrangement. - Will trigger a rearrangement if it exceeds the threshold. + def __init__(self, parallel_config: ParallelConfig, device: torch.device): + self.parallel_config = parallel_config + self.device = device + self.model_states: dict[str, EplbModelState] = {} + """ + Current step in the sliding window. - NOTE: Keep in mind that all EP ranks need to have the same - `expert_rearrangement_step` value to ensure synchronization. - Otherwise, the rearrangement will hang at collective - communication calls. - """ - expert_rearrangement_step_interval: int = 0 - """ - Interval for expert rearrangement steps. - This is a constant and is taken from the config. - """ + Different from `expert_rearrangement_step`, + each EP rank may have its own `expert_load_window_step`. + """ + self.expert_load_window_step: int = 0 + """ + Size of the expert load sliding window. + This is a constant and is taken from the config. + """ + self.expert_load_window_size: int = 0 + """ + Steps after last rearrangement. + Will trigger a rearrangement if it exceeds the threshold. + + NOTE: Keep in mind that all EP ranks need to have the same + `expert_rearrangement_step` value to ensure synchronization. + Otherwise, the rearrangement will hang at collective + communication calls. + """ + self.expert_rearrangement_step: int = 0 + """ + Interval for expert rearrangement steps. + This is a constant and is taken from the config. + """ + self.expert_rearrangement_step_interval: int = 0 @staticmethod def build_initial_global_physical_to_logical_map( @@ -177,27 +189,60 @@ def build_initial_global_physical_to_logical_map( ] return global_physical_to_logical_map - @classmethod - def build( - cls, + def validate_ep_configuration(self, new_model: MixtureOfExperts): + """ + Validate that the expert parallel configuration of + the new model is the same as the existing models. 
+ """ + if len(self.model_states) > 0: + model = next(iter(self.model_states.values())).model + if (model.num_routed_experts != new_model.num_routed_experts + or model.num_redundant_experts + != new_model.num_redundant_experts + or model.num_physical_experts + != new_model.num_physical_experts or + model.num_logical_experts != new_model.num_logical_experts + or model.num_expert_groups != new_model.num_expert_groups): + raise RuntimeError("Model: {} " + "with config {} " + "{} {} {} {} " + "mismatch with new model {} " + "with config {} " + "{} {} {} {}".format( + type(model), + model.num_routed_experts, + model.num_redundant_experts, + model.num_physical_experts, + model.num_logical_experts, + model.num_expert_groups, + type(new_model), + new_model.num_routed_experts, + new_model.num_redundant_experts, + new_model.num_physical_experts, + new_model.num_logical_experts, + new_model.num_expert_groups, + )) + + def add_model( + self, model: MixtureOfExperts, - device: torch.device, - parallel_config: ParallelConfig, + model_config: ModelConfig, global_expert_load: Optional[torch.Tensor] = None, - old_global_expert_indices: Optional[torch.Tensor] = None, + old_global_expert_indices: Optional[list[torch.Tensor]] = None, rank_mapping: Optional[dict[int, int]] = None, - ) -> "EplbState": + ): """ Build the initial EPLB state. """ + self.validate_ep_configuration(model) physical_to_logical_map_list = ( - cls.build_initial_global_physical_to_logical_map( + EplbState.build_initial_global_physical_to_logical_map( model.num_routed_experts, model.num_redundant_experts, )) physical_to_logical_map = torch.tensor( physical_to_logical_map_list, - device=device, + device=self.device, ) # Assuming 8 GPUs per node, this supports up to # (1023 + 1) / 8 = 128 nodes for now. 
@@ -210,11 +255,11 @@ def build( logical_to_physical_map = torch.full( (model.num_logical_experts, max_slots_per_logical_expert), -1, - device=device, + device=self.device, ) logical_replica_count = torch.zeros( (model.num_logical_experts, ), - device=device, + device=self.device, dtype=torch.long, ) @@ -225,42 +270,49 @@ def build( logical_replica_count[logical_idx] += 1 # Duplicate initial mapping for all layers - physical_to_logical_map = physical_to_logical_map.unsqueeze(0).expand( + physical_to_logical_map = (physical_to_logical_map.unsqueeze(0).expand( model.num_moe_layers, -1, - ).contiguous() - logical_to_physical_map = logical_to_physical_map.unsqueeze(0).expand( + ).contiguous()) + logical_to_physical_map = (logical_to_physical_map.unsqueeze(0).expand( model.num_moe_layers, -1, -1, - ).contiguous() - logical_replica_count = logical_replica_count.unsqueeze(0).expand( + ).contiguous()) + logical_replica_count = (logical_replica_count.unsqueeze(0).expand( model.num_moe_layers, -1, - ).contiguous() + ).contiguous()) expert_load_pass = torch.zeros( (model.num_moe_layers, model.num_physical_experts), dtype=torch.int32, - device=device, + device=self.device, ) - expert_load_window_size = parallel_config.eplb_config.window_size + self.expert_load_window_size = \ + self.parallel_config.eplb_config.window_size expert_load_window = torch.zeros( - (expert_load_window_size, model.num_moe_layers, - model.num_physical_experts), + ( + self.expert_load_window_size, + model.num_moe_layers, + model.num_physical_experts, + ), dtype=torch.int32, - device=device, + device=self.device, ) # Set the initial progress of rearrangement to 3/4 - eplb_step_interval = parallel_config.eplb_config.step_interval - expert_rearrangement_step = max( + eplb_step_interval = self.parallel_config.eplb_config.step_interval + self.expert_rearrangement_step = max( 0, eplb_step_interval - eplb_step_interval // 4) + self.expert_rearrangement_step_interval = eplb_step_interval if global_expert_load is not None: ep_group = get_ep_group().device_group - assert global_expert_load.shape == (model.num_moe_layers, - model.num_logical_experts) + assert global_expert_load.shape == ( + model.num_moe_layers, + model.num_logical_experts, + ) assert global_expert_load.dtype == torch.int64 num_replicas = model.num_physical_experts @@ -280,13 +332,13 @@ def build( new_physical_to_logical_map, new_logical_to_physical_map, new_logical_replica_count, - ) = (rebalance_experts( + ) = rebalance_experts( global_expert_load, num_replicas, num_groups, num_nodes, num_gpus, - )) + ) max_physical_slots = new_logical_to_physical_map.shape[-1] assert max_physical_slots <= logical_to_physical_map.shape[-1] @@ -295,7 +347,8 @@ def build( (0, logical_to_physical_map.shape[-1] - max_physical_slots), value=-1, ) - physical_to_logical_map = new_physical_to_logical_map.to(device) + physical_to_logical_map = new_physical_to_logical_map.to( + self.device) logical_to_physical_map.copy_(new_logical_to_physical_map) logical_replica_count.copy_(new_logical_replica_count) @@ -313,24 +366,24 @@ def build( False, rank_mapping, ) - expert_rearrangement_step = 0 + self.expert_rearrangement_step = 0 - return cls( + self.model_states[model_config.compute_hash()] = EplbModelState( physical_to_logical_map, logical_to_physical_map, logical_replica_count, expert_load_pass, expert_load_window, - expert_load_window_size=expert_load_window_size, - expert_rearrangement_step=expert_rearrangement_step, - expert_rearrangement_step_interval=eplb_step_interval, + model_config.model, + 
model, ) - def step(self, - model: MixtureOfExperts, - is_dummy: bool = False, - is_profile: bool = False, - log_stats: bool = False) -> None: + def step( + self, + is_dummy: bool = False, + is_profile: bool = False, + log_stats: bool = False, + ) -> None: """ Step the EPLB state. @@ -353,52 +406,66 @@ def step(self, """ if is_profile: - self.rearrange(model, is_profile=True) + self.rearrange(is_profile=True) return if is_dummy: # Do not record load metrics for dummy steps - self.expert_load_pass.zero_() + for eplb_model_state in self.model_states.values(): + eplb_model_state.expert_load_pass.zero_() if log_stats: # total_expert_load_pass: (num_moe_layers, num_physical_experts) - total_expert_load_pass = self.expert_load_pass.clone() - - # Collect load metrics from all ranks - ep_group = get_ep_group().device_group - all_reduce(total_expert_load_pass, group=ep_group) - - # num_tokens_per_rank: (num_moe_layers, num_ranks) - num_tokens_per_rank = total_expert_load_pass.reshape( - total_expert_load_pass.shape[0], ep_group.size(), - -1).sum(dim=-1).float() - - # Compute balancedness ratio: - # for each layer: - # (mean load across ranks) / (max load across ranks) - avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0) - max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum( - dim=0) - - # Just to make type checker happy - tokens_tensors: list[float] = torch.stack( - [avg_tokens_tensor, max_tokens_tensor]).tolist() - avg_tokens, max_tokens = tokens_tensors - balancedness = avg_tokens / max_tokens if max_tokens > 0 else 0.0 - - if ep_group.rank() == 0: - logger.info( - "EPLB step: avg_tokens=%.2f, max_tokens=%d, " - "balancedness=%.4f", avg_tokens, max_tokens, balancedness) + for eplb_model_state in self.model_states.values(): + total_expert_load_pass = ( + eplb_model_state.expert_load_pass.clone()) + + # Collect load metrics from all ranks + ep_group = get_ep_group().device_group + all_reduce(total_expert_load_pass, group=ep_group) + + # num_tokens_per_rank: (num_moe_layers, num_ranks) + num_tokens_per_rank = (total_expert_load_pass.reshape( + total_expert_load_pass.shape[0], ep_group.size(), + -1).sum(dim=-1).float()) + + # Compute balancedness ratio: + # for each layer: + # (mean load across ranks) / (max load across ranks) + avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0) + max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum( + dim=0) + + # Just to make type checker happy + tokens_tensors: list[float] = torch.stack( + [avg_tokens_tensor, max_tokens_tensor]).tolist() + avg_tokens, max_tokens = tokens_tensors + balancedness = (avg_tokens / + max_tokens if max_tokens > 0 else 0.0) + + if ep_group.rank() == 0: + logger.info( + "EPLB step for model %s: avg_tokens=%.2f, " + "max_tokens=%d, balancedness=%.4f", + eplb_model_state.model_name, + avg_tokens, + max_tokens, + balancedness, + ) # Update the expert load sliding window if not is_dummy: - self.expert_load_window[self.expert_load_window_step] = ( - self.expert_load_pass.clone()) + for eplb_model_state in self.model_states.values(): + eplb_model_state.expert_load_window[ + self. 
+ expert_load_window_step] = \ + eplb_model_state.expert_load_pass.clone( + ) + eplb_model_state.expert_load_pass.zero_() + self.expert_load_window_step += 1 if self.expert_load_window_step >= self.expert_load_window_size: self.expert_load_window_step = 0 - self.expert_load_pass.zero_() # Step the expert rearrangement step # Note that even if this is a dummy step, we still increment the @@ -408,16 +475,15 @@ def step(self, if (self.expert_rearrangement_step >= self.expert_rearrangement_step_interval): self.expert_rearrangement_step = 0 - self.rearrange(model) + self.rearrange() def rearrange( self, - model: MixtureOfExperts, is_profile: bool = False, execute_shuffle: bool = True, - global_expert_load: Optional[torch.Tensor] = None, - rank_mapping: Optional[dict[int, - int]] = None) -> Optional[torch.Tensor]: + global_expert_loads: Optional[list[torch.Tensor]] = None, + rank_mapping: Optional[dict[int, int]] = None, + ) -> Optional[list[torch.Tensor]]: """ Rearrange the experts according to the current load. """ @@ -433,51 +499,69 @@ def rearrange( logger.info("Rearranging experts %s...", "(profile)" if is_profile else "") - if global_expert_load is None: + if global_expert_loads is None: # Map the physical expert load to global logical experts - logical_expert_load_window = torch.zeros( - self.expert_load_window_size, - model.num_moe_layers, - model.num_logical_experts, - dtype=self.expert_load_window.dtype, - device=self.expert_load_window.device, - ) - logical_expert_load_window.scatter_add_( - dim=-1, - index=self.physical_to_logical_map.unsqueeze(0).expand_as( - self.expert_load_window).long(), - src=self.expert_load_window, - ) - + global_expert_load_windows = [] if not execute_shuffle: - metadata = torch.tensor( - [ - model.num_moe_layers, model.num_logical_experts, - self.physical_to_logical_map.shape[1] - ], - dtype=torch.int32, - device="cpu", - ) - torch.distributed.broadcast(metadata, + num_models = torch.tensor([len(self.model_states)], + dtype=torch.int32, + device="cpu") + torch.distributed.broadcast(num_models, group=get_ep_group().cpu_group, group_src=0) - # Perform all-reduce to get the expert load across all ranks - global_expert_load_window = logical_expert_load_window.sum(dim=0) - all_reduce(global_expert_load_window, group=ep_group) + for eplb_model_state in self.model_states.values(): + logical_expert_load_window = torch.zeros( + self.expert_load_window_size, + eplb_model_state.model.num_moe_layers, + eplb_model_state.model.num_logical_experts, + dtype=eplb_model_state.expert_load_window.dtype, + device=eplb_model_state.expert_load_window.device, + ) + logical_expert_load_window.scatter_add_( + dim=-1, + index=eplb_model_state.physical_to_logical_map.unsqueeze( + 0).expand_as( + eplb_model_state.expert_load_window).long(), + src=eplb_model_state.expert_load_window, + ) + if not execute_shuffle: + metadata = torch.tensor( + [ + eplb_model_state.model.num_moe_layers, + eplb_model_state.model.num_logical_experts, + eplb_model_state.physical_to_logical_map.shape[1], + ], + dtype=torch.int32, + device="cpu", + ) + torch.distributed.broadcast(metadata, + group=get_ep_group().cpu_group, + group_src=0) + + # Perform all-reduce to get the expert load across all ranks + global_expert_load_window = logical_expert_load_window.sum( + dim=0) + all_reduce(global_expert_load_window, group=ep_group) + + if not execute_shuffle: + # (num_moe_layers, old_num_physical_experts) + old_global_expert_indices = \ + eplb_model_state.physical_to_logical_map + 
torch.distributed.broadcast(old_global_expert_indices, + group=ep_group, + group_src=0) + global_expert_load_windows.append(global_expert_load_window) if not execute_shuffle: - # (num_moe_layers, old_num_physical_experts) - old_global_expert_indices = self.physical_to_logical_map - torch.distributed.broadcast(old_global_expert_indices, - group=ep_group, - group_src=0) - return global_expert_load_window + return global_expert_load_windows else: assert execute_shuffle - global_expert_load_window = global_expert_load + global_expert_load_windows = global_expert_loads # TODO(bowen): Treat differently for prefill and decode nodes + eplb_model_state = next(iter(self.model_states.values())) + model = eplb_model_state.model num_replicas = model.num_physical_experts num_groups = model.num_expert_groups if rank_mapping is not None and len(rank_mapping) == ep_group.size(): @@ -488,8 +572,8 @@ def rearrange( num_nodes = _node_count_with_rank_mapping(cpu_group, rank_mapping) num_gpus = sum(new_rank != -1 for new_rank in rank_mapping.values()) - num_replicas = num_replicas // ep_group.size( - ) * num_gpus # handle num replicas change + num_replicas = (num_replicas // ep_group.size() * num_gpus + ) # handle num replicas change else: num_nodes = get_node_count() num_gpus = ep_group.size() @@ -501,46 +585,56 @@ def rearrange( "not using hierarchical rearrangement algorithm.\n" f"{num_gpus=}, {num_nodes=}") - # Get new expert mappings - ( - new_physical_to_logical_map, - new_logical_to_physical_map, - new_logical_replica_count, - ) = (rebalance_experts( - global_expert_load_window, - num_replicas, - num_groups, - num_nodes, - num_gpus, - )) - - # Update expert weights - rearrange_expert_weights_inplace( - self.physical_to_logical_map, - new_physical_to_logical_map, - model.expert_weights, - ep_group, - is_profile, - rank_mapping, - ) - - if not is_profile: - if self.physical_to_logical_map.shape[ - 1] != new_physical_to_logical_map.shape[1]: - self.physical_to_logical_map = new_physical_to_logical_map.to( - self.physical_to_logical_map.device) - else: - self.physical_to_logical_map.copy_(new_physical_to_logical_map) - max_physical_slots = new_logical_to_physical_map.shape[-1] - assert max_physical_slots <= self.logical_to_physical_map.shape[-1] - new_logical_to_physical_map = torch.nn.functional.pad( + for eplb_model_state, global_expert_load_window in zip( + self.model_states.values(), global_expert_load_windows): + # Get new expert mappings for the model + ( + new_physical_to_logical_map, new_logical_to_physical_map, - (0, - self.logical_to_physical_map.shape[-1] - max_physical_slots), - value=-1, + new_logical_replica_count, + ) = rebalance_experts( + global_expert_load_window, + num_replicas, + num_groups, + num_nodes, + num_gpus, ) - self.logical_to_physical_map.copy_(new_logical_to_physical_map) - self.logical_replica_count.copy_(new_logical_replica_count) + + # Update expert weights + rearrange_expert_weights_inplace( + eplb_model_state.physical_to_logical_map, + new_physical_to_logical_map, + eplb_model_state.model.expert_weights, + ep_group, + is_profile, + rank_mapping, + ) + + if not is_profile: + if (eplb_model_state.physical_to_logical_map.shape[1] + != new_physical_to_logical_map.shape[1]): + eplb_model_state.physical_to_logical_map = ( + new_physical_to_logical_map.to( + eplb_model_state.physical_to_logical_map.device)) + else: + eplb_model_state.physical_to_logical_map.copy_( + new_physical_to_logical_map) + max_physical_slots = new_logical_to_physical_map.shape[-1] + assert 
(max_physical_slots + <= eplb_model_state.logical_to_physical_map.shape[-1]) + new_logical_to_physical_map = torch.nn.functional.pad( + new_logical_to_physical_map, + ( + 0, + eplb_model_state.logical_to_physical_map.shape[-1] - + max_physical_slots, + ), + value=-1, + ) + eplb_model_state.logical_to_physical_map.copy_( + new_logical_to_physical_map) + eplb_model_state.logical_replica_count.copy_( + new_logical_replica_count) if is_main_rank: assert time_start is not None @@ -554,65 +648,88 @@ def rearrange( return None @staticmethod - def recv_state() -> tuple[torch.Tensor, torch.Tensor]: + def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]: """ Receive the expert load and old placement from the master rank. """ ep_group = get_ep_group() - metadata = torch.empty(3, dtype=torch.int32, device="cpu") - torch.distributed.broadcast(metadata, + num_models = torch.empty(1, dtype=torch.int32, device="cpu") + torch.distributed.broadcast(num_models, group=ep_group.cpu_group, group_src=0) - num_moe_layers, num_logical_experts, num_old_physical_experts = ( - metadata.tolist()) - global_expert_load = torch.zeros( - (num_moe_layers, num_logical_experts), - dtype=torch.int64, - device=ep_group.device, - ) - all_reduce(global_expert_load, group=ep_group.device_group) - old_global_expert_indices = torch.empty( - (num_moe_layers, num_old_physical_experts), - dtype=torch.int64, - device=ep_group.device, - ) - torch.distributed.broadcast(old_global_expert_indices, - group=ep_group.device_group, - group_src=0) - - return global_expert_load, old_global_expert_indices + num_models = num_models.item() + global_expert_loads = [] + old_global_expert_indices_per_model = [] + for _ in range(num_models): + metadata = torch.empty(3, dtype=torch.int32, device="cpu") + torch.distributed.broadcast(metadata, + group=ep_group.cpu_group, + group_src=0) + num_moe_layers, num_logical_experts, num_old_physical_experts = ( + metadata.tolist()) + global_expert_load = torch.zeros( + (num_moe_layers, num_logical_experts), + dtype=torch.int64, + device=ep_group.device, + ) + all_reduce(global_expert_load, group=ep_group.device_group) + old_global_expert_indices = torch.empty( + (num_moe_layers, num_old_physical_experts), + dtype=torch.int64, + device=ep_group.device, + ) + torch.distributed.broadcast( + old_global_expert_indices, + group=ep_group.device_group, + group_src=0, + ) + global_expert_loads.append(global_expert_load) + old_global_expert_indices_per_model.append( + old_global_expert_indices) + return global_expert_loads, old_global_expert_indices_per_model @classmethod def get_epp_state( cls, parallel_config: ParallelConfig, eep_scale_up: bool = False - ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[dict[ - int, int]]]: + ) -> tuple[ + Optional[list[torch.Tensor]], + Optional[list[torch.Tensor]], + Optional[dict[int, int]], + ]: if not eep_scale_up: return None, None, None num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu") - torch.distributed.broadcast(num_local_physical_experts, - group=get_ep_group().cpu_group, - group_src=0) + torch.distributed.broadcast( + num_local_physical_experts, + group=get_ep_group().cpu_group, + group_src=0, + ) num_local_physical_experts = int(num_local_physical_experts.item()) new_ep_size = get_ep_group().world_size - global_expert_load, old_global_expert_indices = ( + global_expert_loads, old_global_expert_indices_per_model = ( EplbState.recv_state()) - num_logical_experts = global_expert_load.shape[1] + + # EP configuration for 
all models has to be the same so as eplb config + num_logical_experts = global_expert_loads[0].shape[1] parallel_config.eplb_config.num_redundant_experts = ( num_local_physical_experts * new_ep_size - num_logical_experts) - assert old_global_expert_indices.shape[ - 1] % num_local_physical_experts == 0 - old_ep_size = old_global_expert_indices.shape[ - 1] // num_local_physical_experts + assert (old_global_expert_indices_per_model[0].shape[1] % + num_local_physical_experts == 0) + old_ep_size = (old_global_expert_indices_per_model[0].shape[1] // + num_local_physical_experts) rank_mapping = { old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size) } - return global_expert_load, old_global_expert_indices, rank_mapping + return ( + global_expert_loads, + old_global_expert_indices_per_model, + rank_mapping, + ) def _node_count_with_rank_mapping( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 2beb0537c99c..bf8a83321178 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -18,7 +18,6 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal -from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -376,9 +375,6 @@ def propose( draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) - # EPLB step - self.eplb_step() - # [batch_size, num_speculative_tokens] draft_token_ids = torch.stack(draft_token_ids_list, dim=1) return draft_token_ids @@ -806,13 +802,7 @@ def prepare_inputs( return spec_common_attn_metadata, token_indices - def load_model(self, - target_model: nn.Module, - eep_scale_up: bool = False) -> None: - global_expert_load, old_global_expert_indices, rank_mapping = \ - EplbState.get_epp_state(self.vllm_config.parallel_config, - eep_scale_up) - + def load_model(self, target_model: nn.Module) -> None: draft_model_config = \ self.vllm_config.speculative_config.draft_model_config target_attn_layer_names = set( @@ -873,39 +863,6 @@ def load_model(self, logger.info("Loading EAGLE LM head weights from the target model.") self.model.lm_head = target_language_model.lm_head - if is_mixture_of_experts( - self.model) and self.vllm_config.parallel_config.enable_eplb: - logger.info("EPLB is enabled for Eagle drafter model %s.", - draft_model_config.model) - - self.eplb_state = EplbState.build( - self.model, - self.device, - self.vllm_config.parallel_config, - global_expert_load, - old_global_expert_indices, - rank_mapping, - ) - - def eplb_step(self, - is_dummy: bool = False, - is_profile: bool = False) -> None: - """ - Step for the EPLB (Expert Parallelism Load Balancing) state. - """ - if not self.vllm_config.parallel_config.enable_eplb or \ - not is_mixture_of_experts(self.model): - return - - assert self.eplb_state is not None - self.eplb_state.step( - self.model, - is_dummy, - is_profile, - log_stats=self.vllm_config.parallel_config.eplb_config. 
- log_balancedness, - ) - @torch.inference_mode() def dummy_run( self, @@ -928,8 +885,6 @@ def dummy_run( hidden_states=self.hidden_states[:num_tokens], inputs_embeds=inputs_embeds, ) - if not skip_eplb: - self.eplb_step(is_dummy=True, is_profile=is_profile) def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fd4396bfcb49..32d5b9d5a862 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1726,7 +1726,6 @@ def eplb_step(self, model = self.get_model() assert is_mixture_of_experts(model) self.eplb_state.step( - model, is_dummy, is_profile, log_stats=self.parallel_config.eplb_config.log_balancedness, @@ -2380,6 +2379,8 @@ def propose_draft_token_ids( else: indices = [] offset = 0 + assert spec_decode_metadata is not None, \ + "No spec decode metadata for medusa" for num_draft, tokens in zip( spec_decode_metadata.num_draft_tokens, sampled_token_ids): @@ -2429,7 +2430,7 @@ def propose_draft_token_ids( target_token_ids = self.input_ids.gpu[:num_scheduled_tokens] # TODO(woosuk): Support M-RoPE. target_positions = self.positions.gpu[:num_scheduled_tokens] - if self.use_aux_hidden_state_outputs: + if self.use_aux_hidden_state_outputs and aux_hidden_states: target_hidden_states = torch.cat( [h[:num_scheduled_tokens] for h in aux_hidden_states], dim=-1) @@ -2454,7 +2455,7 @@ def propose_draft_token_ids( target_token_ids = self.input_ids.gpu[token_indices] # TODO(woosuk): Support M-RoPE. target_positions = self.positions.gpu[token_indices] - if self.use_aux_hidden_state_outputs: + if self.use_aux_hidden_state_outputs and aux_hidden_states: target_hidden_states = torch.cat( [h[token_indices] for h in aux_hidden_states], dim=-1) else: @@ -2527,9 +2528,13 @@ def load_model(self, eep_scale_up: bool = False) -> None: eep_scale_up: the model loading is for elastic EP scale up. """ logger.info("Starting to load model %s...", self.model_config.model) - global_expert_load, old_global_expert_indices, rank_mapping = \ - EplbState.get_epp_state(self.parallel_config, eep_scale_up) + global_expert_loads, old_global_expert_indices_per_model, \ + rank_mapping = EplbState.get_epp_state( + self.parallel_config.eep_scale_up) + if self.parallel_config.enable_eplb: + self.eplb_state = EplbState(self.parallel_config, self.device) + eplb_models = 0 with DeviceMemoryProfiler() as m: time_before_load = time.perf_counter() model_loader = get_model_loader(self.load_config) @@ -2541,11 +2546,29 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.device) if hasattr(self, "drafter"): logger.info("Loading drafter model...") - self.drafter.load_model(self.model, eep_scale_up=eep_scale_up) - if hasattr(self.drafter, "eplb_state"): - assert hasattr(self.drafter, "model") and \ - is_mixture_of_experts(self.drafter.model) - self.drafter_eplb_state = self.drafter.eplb_state + self.drafter.load_model(self.model) + if (hasattr(self.drafter, "model") + and is_mixture_of_experts(self.drafter.model) + and self.parallel_config.enable_eplb): + logger.info( + "EPLB is enabled for model %s.", self.vllm_config. 
+ speculative_config.draft_model_config.model) + + global_expert_load = global_expert_loads[ + eplb_models] if global_expert_loads else None + old_global_expert_indices = \ + old_global_expert_indices_per_model[eplb_models] \ + if old_global_expert_indices_per_model else None + if self.eplb_state is None: + self.eplb_state = EplbState(self.parallel_config, + self.device) + self.eplb_state.add_model( + self.drafter.model, + self.vllm_config.speculative_config.draft_model_config, + global_expert_load, old_global_expert_indices, + rank_mapping) + eplb_models += 1 + if self.use_aux_hidden_state_outputs: if supports_eagle3(self.model): self.model.set_aux_hidden_state_layers( @@ -2564,14 +2587,14 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.model) and self.parallel_config.enable_eplb: logger.info("EPLB is enabled for model %s.", self.model_config.model) - self.eplb_state = EplbState.build( - self.model, - self.device, - self.parallel_config, - global_expert_load, - old_global_expert_indices, - rank_mapping, - ) + global_expert_load = global_expert_loads[ + eplb_models] if global_expert_loads else None + old_global_expert_indices = old_global_expert_indices_per_model[ + eplb_models] if old_global_expert_indices_per_model else None + assert self.eplb_state is not None + self.eplb_state.add_model(self.model, self.model_config, + global_expert_load, + old_global_expert_indices, rank_mapping) if ( self.vllm_config.compilation_config.level == \ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d4db23eedcb4..da1c1d709620 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -21,6 +21,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -516,9 +517,8 @@ def _eplb_before_scale_down(self, old_ep_size: int, for old_ep_rank in range(old_ep_size) } assert self.model_runner.eplb_state is not None - self.model_runner.eplb_state.rearrange(self.model_runner.model, - execute_shuffle=True, - global_expert_load=None, + self.model_runner.eplb_state.rearrange(execute_shuffle=True, + global_expert_loads=None, rank_mapping=rank_mapping) torch.cuda.synchronize() if get_ep_group().rank == 0: @@ -526,7 +526,7 @@ def _eplb_before_scale_down(self, old_ep_size: int, def _eplb_after_scale_up( self, old_ep_size: int, new_ep_size: int, - global_expert_load: Optional[torch.Tensor]) -> None: + global_expert_loads: Optional[list[torch.Tensor]]) -> None: from vllm.distributed.parallel_state import get_ep_group if get_ep_group().rank == 0: logger.info("[Elastic EP] Starting expert resharding " @@ -537,9 +537,8 @@ def _eplb_after_scale_up( } assert self.model_runner.eplb_state is not None self.model_runner.eplb_state.rearrange( - self.model_runner.model, execute_shuffle=True, - global_expert_load=global_expert_load, + global_expert_loads=global_expert_loads, rank_mapping=rank_mapping) if get_ep_group().rank == 0: logger.info("[Elastic EP] Expert resharding completed!") @@ -604,8 +603,8 @@ def update_moe_modules(moe_modules: list[FusedMoE]): update_moe_modules(model_moe_modules) if hasattr(self.model_runner, "drafter") and hasattr( - self.model_runner, "drafter_eplb_state") and \ - hasattr(self.model_runner.drafter, "model"): + self.model_runner.drafter, 
"model") and \ + is_mixture_of_experts(self.model_runner.drafter.model): drafter_moe_modules = [ module for module in self.model_runner.drafter.model.modules() if (module.__class__.__name__ == "FusedMoE" @@ -625,7 +624,7 @@ def update_moe_modules(moe_modules: list[FusedMoE]): parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - self.model_runner.eplb_state.logical_replica_count.shape[1]) - global_expert_load = None + global_expert_loads = None else: num_local_physical_experts = torch.tensor([num_local_experts], dtype=torch.int32, @@ -636,10 +635,10 @@ def update_moe_modules(moe_modules: list[FusedMoE]): num_local_physical_experts = num_local_physical_experts.item() new_physical_experts = num_local_physical_experts * new_ep_size assert self.model_runner.eplb_state is not None - global_expert_load = self.model_runner.eplb_state.rearrange( - self.model_runner.model, execute_shuffle=False) + global_expert_loads = self.model_runner.eplb_state.rearrange( + execute_shuffle=False) parallel_config.eplb_config.num_redundant_experts = ( - new_physical_experts - global_expert_load.shape[1]) + new_physical_experts - global_expert_loads[0].shape[1]) prepare_communication_buffer_for_model(self.model_runner.model) if hasattr(self.model_runner, "drafter") and \ hasattr(self.model_runner.drafter, "model"): @@ -648,7 +647,7 @@ def update_moe_modules(moe_modules: list[FusedMoE]): self.model_runner.model.update_physical_experts_metadata( num_physical_experts=new_physical_experts, num_local_physical_experts=num_local_physical_experts) - return global_expert_load + return global_expert_loads def reinitialize_distributed( self, reconfig_request: ReconfigureDistributedRequest) -> None: @@ -678,12 +677,12 @@ def reinitialize_distributed( self.distributed_init_method, self.local_rank) - global_expert_load = self._reconfigure_moe(old_ep_size, new_ep_size) + global_expert_loads = self._reconfigure_moe(old_ep_size, new_ep_size) if new_ep_size > old_ep_size: - assert global_expert_load is not None + assert global_expert_loads is not None self._eplb_after_scale_up(old_ep_size, new_ep_size, - global_expert_load) + global_expert_loads) def save_sharded_state( self, From ca985440d831b83dfacad4dac5b57c635fa4bc18 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 24 Sep 2025 17:32:32 +0000 Subject: [PATCH 07/32] Add test and fix Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/e2e/test_eplb_spec_decode.py | 104 +++++++++++++++++++ vllm/model_executor/models/deepseek_eagle.py | 43 ++++++++ vllm/model_executor/models/deepseek_mtp.py | 36 +++++-- vllm/model_executor/models/deepseek_v2.py | 40 ++++--- vllm/model_executor/models/interfaces.py | 3 +- vllm/model_executor/models/llama4.py | 36 +++++-- vllm/model_executor/models/llama4_eagle.py | 2 + vllm/model_executor/models/minicpm.py | 4 + vllm/model_executor/models/qwen3_next_mtp.py | 8 +- vllm/v1/worker/gpu_model_runner.py | 8 +- 11 files changed, 244 insertions(+), 42 deletions(-) create mode 100644 tests/v1/e2e/test_eplb_spec_decode.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c4ea4b675649..39797d668df0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -260,7 +260,7 @@ steps: - pytest -v -s tokenization - label: V1 Test e2e + engine # 30min - timeout_in_minutes: 45 + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ diff --git a/tests/v1/e2e/test_eplb_spec_decode.py b/tests/v1/e2e/test_eplb_spec_decode.py new file 
mode 100644 index 000000000000..bc9a124bdf54 --- /dev/null +++ b/tests/v1/e2e/test_eplb_spec_decode.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import pytest +import torch + +from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory + + +def create_test_prompts() -> list[str]: + return [ + "A robot may not injure a human being", "To be or not to be,", + "What is the meaning of life?" + ] + + +@pytest.fixture +def sampling_config(): + return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) + + +@pytest.mark.parametrize("model_setup", [ + ("meta-llama/Llama-4-Scout-17B-16E-Instruct", 4), +], + ids=["llama4"]) +def test_eplb_model( + monkeypatch: pytest.MonkeyPatch, + sampling_config: SamplingParams, + model_setup: tuple[str, int], +): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") + + model_name, tp_size = model_setup + test_prompts = create_test_prompts() + llm = LLM( + model=model_name, + tensor_parallel_size=tp_size, + max_model_len=2048, + enable_expert_parallel=True, + num_redundant_experts=tp_size, + eplb_window_size=1000, + eplb_step_interval=3000, + eplb_log_balancedness=True, + enable_eplb=True, + load_format="dummy", + gpu_memory_utilization=0.95, + ) + test_prompts = create_test_prompts() + llm.generate(test_prompts, sampling_config) + del llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + +@pytest.mark.parametrize( + "model_setup", [ + ("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 4), + ("deepseek_mtp", "eagle618/deepseek-v3-random", None, 4), + ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + ], + ids=["deepseek_eagle", "deepseek_mtp", "qwen3_next_mtp", "llama4_eagle"]) +def test_eplb_spec_decode( + monkeypatch: pytest.MonkeyPatch, + sampling_config: SamplingParams, + model_setup: tuple[str, str, str, int], +): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") + + method, model_name, spec_model_name, tp_size = model_setup + llm = LLM( + model=model_name, + trust_remote_code=True, + tensor_parallel_size=tp_size, + speculative_config={ + "method": method, + "model": spec_model_name, + "num_speculative_tokens": 1, + "max_model_len": 2048, + }, + max_model_len=2048, + enable_expert_parallel=True, + num_redundant_experts=tp_size, + eplb_window_size=1000, + eplb_step_interval=3000, + eplb_log_balancedness=True, + enable_eplb=True, + load_format="dummy", + ) + test_prompts = create_test_prompts() + llm.generate(test_prompts, sampling_config) + del llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index ed7e7614800f..64a5a05e8397 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -18,10 +18,14 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, + DeepseekV2MoE, DeepseekV3ForCausalLM) +from vllm.utils import init_logger from .utils import 
AutoWeightsLoader, maybe_prefix +logger = init_logger(__name__) + @support_torch_compile class DeepseekV2Model(nn.Module): @@ -205,6 +209,45 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale) + # Set MoE hyperparameters + self.set_moe_parameters() + + def set_moe_parameters(self): + self.expert_weights = [] + + self.num_moe_layers = self.config.num_hidden_layers + self.num_expert_groups = self.config.n_group + + self.moe_layers: list[FusedMoE] = [] + example_moe = None + for layer in self.model.layers: + assert isinstance(layer, DeepseekV2DecoderLayer) + if isinstance(layer.mlp, DeepseekV2MoE): + # Pick last one layer since the first ones may be dense layers. + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning( + "EagleDeepseekV3ForCausalLM: No DeepseekV2MoE layer found in " + "model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = \ + example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index a23d5c5c6dca..601f02bdcd57 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -8,6 +8,7 @@ from transformers import PretrainedConfig from vllm.config import VllmConfig +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -22,6 +23,8 @@ from .interfaces import MixtureOfExperts, SupportsPP from .utils import maybe_prefix +logger = init_logger(__name__) + class SharedHead(nn.Module): @@ -141,12 +144,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = DeepSeekMultiTokenPredictor(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "model")) - config = vllm_config.model_config.hf_config + self.set_moe_parameters() + def set_moe_parameters(self): self.expert_weights = [] # Set MoE hyperparameters - self.num_moe_layers = config.num_nextn_predict_layers - self.num_expert_groups = config.n_group + self.num_moe_layers = self.config.num_nextn_predict_layers + self.num_expert_groups = self.config.n_group self.moe_layers: list[FusedMoE] = [] example_moe = None @@ -159,14 +163,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.moe_layers.append(layer.mlp.experts) if example_moe is None: - raise RuntimeError("No DeepseekV2MoE layer found in model.layers.") - - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = 
example_moe.n_redundant_experts + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("DeepSeekMTP: No DeepseekV2MoE layer " + "found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = \ + example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts def set_eplb_state( self, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 415d36c681d8..3846b40db174 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -40,6 +40,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -65,6 +66,8 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class DeepseekV2MLP(nn.Module): @@ -834,12 +837,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + # Set MoE hyperparameters + self.set_moe_parameters() + + def set_moe_parameters(self): self.expert_weights = [] - # Set MoE hyperparameters - self.num_moe_layers = (config.num_hidden_layers - - config.first_k_dense_replace) - self.num_expert_groups = config.n_group + self.num_moe_layers = (self.config.num_hidden_layers - + self.config.first_k_dense_replace) + self.num_expert_groups = self.config.n_group self.moe_layers: list[FusedMoE] = [] example_moe = None @@ -854,14 +860,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.moe_layers.append(layer.mlp.experts) if example_moe is None: - raise RuntimeError("No DeepseekV2MoE layer found in model.layers.") - - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in " + "model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = \ + example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts def set_eplb_state( self, diff --git 
a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6be70c4b3b21..fa35c09e5a59 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -552,7 +552,8 @@ def update_physical_experts_metadata( def is_mixture_of_experts(model: object) -> TypeIs[MixtureOfExperts]: - return isinstance(model, MixtureOfExperts) + return (isinstance(model, MixtureOfExperts) + and getattr(model, "num_moe_layers", 0) > 0) @runtime_checkable diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index da5c06a1ddf4..adee6e1c7bf7 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -29,6 +29,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.distributed import get_ep_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -45,6 +46,8 @@ from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk, is_pp_missing_parameter) +logger = init_logger(__name__) + class Llama4MoE(nn.Module): @@ -686,7 +689,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix, layer_type=Llama4DecoderLayer) + # Set MoE hyperparameters + self.set_moe_parameters() + def set_moe_parameters(self): self.expert_weights = [] self.moe_layers: list[FusedMoE] = [] @@ -699,17 +705,25 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.moe_layers.append(layer.feed_forward.experts) if example_moe is None: - raise RuntimeError("No Llama4MoE layer found in model.layers.") - - # Set MoE hyperparameters - self.num_moe_layers = len(self.moe_layers) - self.num_expert_groups = 1 - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("No Llama4MoE layer found in model.layers.") + else: + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = \ + example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts def set_eplb_state( self, diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index a203af53205c..152f53985bf5 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -193,6 +193,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale = getattr(self.config, "logit_scale", 1.0) 
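A minimal sketch of the guarded capability check introduced in interfaces.py above, using a hypothetical stand-in class rather than the real vLLM types: a model that exposes the MixtureOfExperts attributes but instantiated no MoE layers is no longer treated as MoE, so EPLB setup skips it.

class DenseDrafterStub:
    # set_moe_parameters() found no MoE layer and zeroed the counters
    num_moe_layers = 0

def is_moe_like(model: object) -> bool:
    # mirrors: isinstance(model, MixtureOfExperts) and num_moe_layers > 0
    return getattr(model, "num_moe_layers", 0) > 0

assert not is_moe_like(DenseDrafterStub())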
self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale) + # Set MoE hyperparameters + self.set_moe_parameters() def forward( self, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 0986ea07406a..9a4f7260b660 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -528,6 +528,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + parallel_config = vllm_config.parallel_config self.prefix = prefix self.vllm_config = vllm_config @@ -561,6 +562,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + if (parallel_config.enable_eplb + and getattr(config, "num_experts", 0) > 0): + raise NotImplementedError("EPLB is not supported for MiniCPM yet.") def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): return MiniCPMModel(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 7c86dca45d21..283a9f111381 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -52,6 +52,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.mtp_start_layer_idx = config.num_hidden_layers self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1) + enable_eplb = vllm_config.parallel_config.enable_eplb self.embed_tokens = VocabParallelEmbedding( self.vocab_size, @@ -75,6 +76,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config=cache_config, quant_config=quant_config, prefix=f'{prefix}.layers.{idx}', + enable_eplb=enable_eplb, ) for idx in range(self.num_mtp_layers)) self.make_empty_intermediate_tensors = ( @@ -246,12 +248,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + self.set_moe_parameters() + def set_moe_parameters(self): self.expert_weights = [] self.moe_layers: list[FusedMoE] = [] example_moe = None - for layer in self.model.layers.values(): + for layer in self.model.layers: assert isinstance(layer, Qwen3NextDecoderLayer) if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): example_moe = layer.mlp @@ -296,7 +300,7 @@ def update_physical_experts_metadata( self.num_local_physical_experts = num_local_physical_experts self.num_redundant_experts = (num_physical_experts - self.num_logical_experts) - for layer in self.model.layers.values(): + for layer in self.model.layers: assert isinstance(layer, Qwen3NextDecoderLayer) if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): moe = layer.mlp diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 32d5b9d5a862..cd556a5c9e9c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2529,8 +2529,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: """ logger.info("Starting to load model %s...", self.model_config.model) global_expert_loads, old_global_expert_indices_per_model, \ - rank_mapping = EplbState.get_epp_state( - self.parallel_config.eep_scale_up) + rank_mapping = EplbState.get_epp_state(eep_scale_up) if self.parallel_config.enable_eplb: self.eplb_state = 
EplbState(self.parallel_config, self.device) @@ -2551,8 +2550,9 @@ def load_model(self, eep_scale_up: bool = False) -> None: and is_mixture_of_experts(self.drafter.model) and self.parallel_config.enable_eplb): logger.info( - "EPLB is enabled for model %s.", self.vllm_config. - speculative_config.draft_model_config.model) + "EPLB is enabled for drafter model %s.", + self.vllm_config.speculative_config.draft_model_config. + model) global_expert_load = global_expert_loads[ eplb_models] if global_expert_loads else None From e713f4296182bab3fbc0ea8db2364588dbed294f Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 7 Oct 2025 13:19:16 +0000 Subject: [PATCH 08/32] Update spec decode Signed-off-by: ilmarkov --- tests/v1/e2e/test_eplb_spec_decode.py | 45 ++++++++++++++++++--------- vllm/v1/spec_decode/eagle.py | 4 --- vllm/v1/spec_decode/medusa.py | 7 +---- vllm/v1/worker/gpu_model_runner.py | 4 +-- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/tests/v1/e2e/test_eplb_spec_decode.py b/tests/v1/e2e/test_eplb_spec_decode.py index bc9a124bdf54..1919c10b87d5 100644 --- a/tests/v1/e2e/test_eplb_spec_decode.py +++ b/tests/v1/e2e/test_eplb_spec_decode.py @@ -11,8 +11,9 @@ def create_test_prompts() -> list[str]: return [ - "A robot may not injure a human being", "To be or not to be,", - "What is the meaning of life?" + "A robot may not injure a human being", + "To be or not to be,", + "What is the meaning of life?", ] @@ -21,10 +22,13 @@ def sampling_config(): return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) -@pytest.mark.parametrize("model_setup", [ - ("meta-llama/Llama-4-Scout-17B-16E-Instruct", 4), -], - ids=["llama4"]) +@pytest.mark.parametrize( + "model_setup", + [ + ("meta-llama/Llama-4-Scout-17B-16E-Instruct", 4), + ], + ids=["llama4"], +) def test_eplb_model( monkeypatch: pytest.MonkeyPatch, sampling_config: SamplingParams, @@ -42,8 +46,8 @@ def test_eplb_model( max_model_len=2048, enable_expert_parallel=True, num_redundant_experts=tp_size, - eplb_window_size=1000, - eplb_step_interval=3000, + eplb_window_size=4, + eplb_step_interval=16, eplb_log_balancedness=True, enable_eplb=True, load_format="dummy", @@ -57,17 +61,28 @@ def test_eplb_model( @pytest.mark.parametrize( - "model_setup", [ - ("eagle", "eagle618/deepseek-v3-random", - "eagle618/eagle-deepseek-v3-random", 4), + "model_setup", + [ + ( + "eagle", + "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", + 4, + ), ("deepseek_mtp", "eagle618/deepseek-v3-random", None, 4), ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4), pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + ( + "eagle", + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", + 4, + ), + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"), + ), ], - ids=["deepseek_eagle", "deepseek_mtp", "qwen3_next_mtp", "llama4_eagle"]) + ids=["deepseek_eagle", "deepseek_mtp", "qwen3_next_mtp", "llama4_eagle"], +) def test_eplb_spec_decode( monkeypatch: pytest.MonkeyPatch, sampling_config: SamplingParams, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0a2299f97033..cdc164683122 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -11,7 +11,6 @@ from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig, get_layers_from_vllm_config 
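The drafter no longer owns its own EplbState; the runner keeps one shared state and registers each MoE model into it. A rough sketch of that registration shape, with hypothetical names (this add_model is a stand-in, not the exact vLLM signature):

from dataclasses import dataclass, field

@dataclass
class SharedEplbStateSketch:
    models: dict = field(default_factory=dict)

    def add_model(self, name: str, num_moe_layers: int) -> None:
        # dense models (num_moe_layers == 0) never register
        if num_moe_layers > 0:
            self.models[name] = num_moe_layers

state = SharedEplbStateSketch()
state.add_model("target", num_moe_layers=58)      # made-up layer counts
state.add_model("mtp_drafter", num_moe_layers=1)
assert len(state.models) == 2                     # one step() covers both models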
-from vllm.distributed.eplb.eplb_state import EplbState from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger @@ -58,7 +57,6 @@ def __init__( self.method = self.speculative_config.method self.runner = runner - self.eplb_state: Optional[EplbState] = None self.device = device self.dtype = vllm_config.model_config.dtype @@ -1016,8 +1014,6 @@ def load_model(self, target_model: nn.Module) -> None: def dummy_run( self, num_tokens: int, - skip_eplb: bool = False, - is_profile: bool = False, ) -> None: with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): if self.supports_mm_inputs: diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 085f9bc41c35..b8ec1b4e0540 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -1,12 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import torch import torch.nn as nn from vllm.config import VllmConfig -from vllm.distributed.eplb.eplb_state import EplbState from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model @@ -35,7 +33,6 @@ def __init__( vllm_config.speculative_config.draft_model_config.get_hidden_size() ) self.dtype = vllm_config.model_config.dtype - self.eplb_state: Optional[EplbState] = None self.device = device def propose( @@ -64,9 +61,7 @@ def load_model(self, target_model: nn.Module, eep_scale_up: bool = False) -> Non assert not is_mixture_of_experts(self.model) @torch.inference_mode() - def dummy_run( - self, num_tokens: int, skip_eplb: bool = False, is_profile: bool = False - ) -> None: + def dummy_run(self, num_tokens: int) -> None: hidden_states = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d7320fcecb31..42bc067e62cb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3467,9 +3467,7 @@ def _dummy_run( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - self.drafter.dummy_run( - num_tokens, skip_eplb=skip_eplb, is_profile=is_profile - ) + self.drafter.dummy_run(num_tokens) # This is necessary to avoid blocking DP. 
# For dummy runs, we typically skip EPLB since we don't have any real From a70a344d0c6859cc0c6bafcb5fdf2d05ae81adac Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 7 Oct 2025 18:58:20 +0000 Subject: [PATCH 09/32] init Signed-off-by: Sage Moore --- vllm/forward_context.py | 12 ++++- vllm/v1/worker/dp_utils.py | 82 ++++++++++++++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 26 +++++----- 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index a6a1e36bfe95..3fab185eb521 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -12,6 +12,7 @@ import vllm.envs as envs from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger +from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ubatch_utils import UBatchSlices if TYPE_CHECKING: @@ -267,7 +268,16 @@ def set_forward_context( if vllm_config.parallel_config.data_parallel_size > 1 and ( attn_metadata is not None or num_tokens is not None ): - assert num_tokens_across_dp is not None + if num_tokens_across_dp is None: + assert ubatch_slices is None + assert num_tokens is not None + _, num_tokens_across_dp = coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens, + parallel_config=vllm_config.parallel_config, + allow_microbatching=False, + allow_dp_padding=False, + ) + assert num_tokens_across_dp is not None dp_metadata = DPMetadata.make( vllm_config.parallel_config, num_tokens or 0, num_tokens_across_dp ) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 7a943909a8ba..143a5cdfdbb7 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -37,6 +37,7 @@ def _get_device_and_group(parallel_config: ParallelConfig): def _run_ar( should_ubatch: bool, + should_dp_pad: bool, orig_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int, parallel_config: ParallelConfig, @@ -44,10 +45,11 @@ def _run_ar( dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_rank device, group = _get_device_and_group(parallel_config) - tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32) + tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32) tensor[0][dp_rank] = orig_num_tokens_per_ubatch tensor[1][dp_rank] = padded_num_tokens_per_ubatch tensor[2][dp_rank] = 1 if should_ubatch else 0 + tensor[3][dp_rank] = 1 if should_dp_pad else 0 dist.all_reduce(tensor, group=group) return tensor @@ -76,6 +78,7 @@ def _synchronize_dp_ranks( num_tokens_unpadded: int, num_tokens_padded: int, should_attempt_ubatching: bool, + allow_dp_padding: bool, parallel_config: ParallelConfig, ) -> tuple[bool, Optional[torch.Tensor]]: """ @@ -100,17 +103,29 @@ def _synchronize_dp_ranks( # will run and if we are using ubatching or not. 
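As a standalone illustration (simulated all-reduce, toy token counts, not part of this patch), the consensus encoding behind _run_ar: each rank fills its own column of a (4, dp_size) tensor, and after reduction every rank can take the unanimous-vote and max decisions locally.

import torch

dp_size = 2
# per rank: (orig tokens, padded tokens, wants ubatching, allows DP padding)
local = [(100, 128, 1, 1), (90, 96, 0, 1)]
per_rank = []
for rank, vals in enumerate(local):
    t = torch.zeros(4, dp_size, dtype=torch.int32)
    t[:, rank] = torch.tensor(vals, dtype=torch.int32)
    per_rank.append(t)
reduced = sum(per_rank)                            # stands in for dist.all_reduce

should_ubatch = bool(torch.all(reduced[2] == 1))   # rank 1 voted no -> False
should_dp_pad = bool(torch.all(reduced[3] == 1))   # unanimous -> True
max_tokens = int(reduced[1].max())                 # both ranks pad to 128
assert (should_ubatch, should_dp_pad, max_tokens) == (False, True, 128)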
tensor = _run_ar( should_ubatch=should_attempt_ubatching, + should_dp_pad=allow_dp_padding, orig_num_tokens_per_ubatch=num_tokens_unpadded, padded_num_tokens_per_ubatch=num_tokens_padded, parallel_config=parallel_config, ) - # Ensure that each rank is processing the same nuber of tokens num_tokens_across_dp = tensor[1, :] - max_num_tokens = int(num_tokens_across_dp.max().item()) - num_tokens_after_padding = torch.tensor( - [max_num_tokens] * len(num_tokens_across_dp), device="cpu", dtype=torch.int32 - ) + should_dp_pad = bool(torch.all(tensor[3] == 1).item()) + + # DP ranks should all have the same value for allow_padding + assert allow_dp_padding == should_dp_pad + + if should_dp_pad: + # If DP padding is enabled, ensure that each rank is processing the same number + # of tokens + max_num_tokens = int(num_tokens_across_dp.max().item()) + num_tokens_after_padding = torch.tensor( + [max_num_tokens] * len(num_tokens_across_dp), + device="cpu", + dtype=torch.int32, + ) + else: + num_tokens_after_padding = num_tokens_across_dp should_ubatch = _post_process_ubatch(tensor) @@ -118,22 +133,44 @@ def _synchronize_dp_ranks( def coordinate_batch_across_dp( - num_scheduled_tokens_per_request: np.ndarray, num_tokens_unpadded: int, - num_tokens_padded: int, - parallel_config: ParallelConfig, allow_microbatching: bool, - uniform_decode: bool, + allow_dp_padding: bool, + parallel_config: ParallelConfig, + num_tokens_padded: Optional[int] = None, + uniform_decode: Optional[bool] = None, + num_scheduled_tokens_per_request: Optional[np.ndarray] = None, ) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]: """ Coordinates amongst all DP ranks to determine if and how the full batch should be split into microbatches. + Args: + num_tokens_unpadded: Number of tokens without accounting for padding + + allow_microbatching: If microbatching should be attempted + + allow_dp_padding: If all DP ranks should be padded up to the same value + + parallel_config: The parallel config + + num_tokens_padded: Number of tokens including any non-DP padding (CUDA graphs, + TP, etc) + + uniform_decode: Only used if allow_microbatching is True. True if the batch + only contains single token decodes + + num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The + number of tokens per request. + Returns: tuple[ ubatch_slices: if this is set then all DP ranks have agreed to microbatch + num_tokens_after_padding: A tensor containing the total number of - tokens per-microbatch for each DP rank including padding. + tokens per-microbatch for each DP rank including padding. Will be + padded up to the max value across all DP ranks when allow_dp_padding + is True. ] """ @@ -141,21 +178,25 @@ def coordinate_batch_across_dp( # Early exit. return None, None - # Check preconditions for microbatching - should_attempt_ubatching = check_ubatch_thresholds( - parallel_config, - num_tokens_unpadded, - uniform_decode=uniform_decode, - ) + # If the caller has explicitly enabled microbatching. + should_attempt_ubatching = False + if allow_microbatching: + # Check preconditions for microbatching + assert uniform_decode is not None + should_attempt_ubatching = check_ubatch_thresholds( + parallel_config, + num_tokens_unpadded, + uniform_decode=uniform_decode, + ) - # If the caller has explicitly disabled microbatching. 
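Conversely, when allow_dp_padding is False (the new fallback path used from set_forward_context), ranks still exchange their token counts but nothing is padded up to the max. A simulated sketch with made-up counts:

import torch

local_tokens = [96, 128, 64, 112]        # per-rank unpadded token counts
tensor = torch.zeros(len(local_tokens), dtype=torch.int32)
for rank, n in enumerate(local_tokens):  # stands in for the real all-reduce
    tensor[rank] += n

# allow_dp_padding=False: every rank keeps its own count
assert tensor.tolist() == local_tokens
# allow_dp_padding=True would instead replicate the max (128) across all ranks
assert int(tensor.max()) == 128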
- if not allow_microbatching: - should_attempt_ubatching = False + if num_tokens_padded is None: + num_tokens_padded = num_tokens_unpadded (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks( num_tokens_unpadded, num_tokens_padded, should_attempt_ubatching, + allow_dp_padding, parallel_config, ) @@ -170,6 +211,7 @@ def coordinate_batch_across_dp( assert num_tokens_after_padding is not None token_split_point = int(num_tokens_after_padding[0].item()) // 2 + assert num_scheduled_tokens_per_request is not None ubatch_slices = create_ubatch_slices( num_scheduled_tokens_per_request, token_split_point ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 42bc067e62cb..e286a55cf993 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1170,12 +1170,13 @@ def _prepare_inputs( max_num_scheduled_tokens == self.uniform_decode_query_len ) and (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens) ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp( - num_scheduled_tokens, - num_tokens_unpadded, - num_tokens_padded, - self.parallel_config, - True, - uniform_decode, + num_tokens_unpadded=num_tokens_unpadded, + parallel_config=self.parallel_config, + allow_microbatching=True, + allow_dp_padding=True, + num_tokens_padded=num_tokens_padded, + uniform_decode=uniform_decode, + num_scheduled_tokens_per_request=num_scheduled_tokens, ) self.seq_lens.np[:num_reqs] = ( @@ -3284,12 +3285,13 @@ def _dummy_run( # We currently only microbatch if the number of tokens is # over a certain threshold. ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp( - num_scheduled_tokens, - total_num_scheduled_tokens, - total_num_scheduled_tokens, - self.vllm_config.parallel_config, - allow_microbatching, - uniform_decode, + num_tokens_unpadded=total_num_scheduled_tokens, + parallel_config=self.vllm_config.parallel_config, + allow_microbatching=allow_microbatching, + allow_dp_padding=True, + num_tokens_padded=total_num_scheduled_tokens, + uniform_decode=uniform_decode, + num_scheduled_tokens_per_request=num_scheduled_tokens, ) num_tokens_after_padding = num_tokens if num_tokens_across_dp is not None: From 3b51ef906c7de7b588b8b829bdd70457c159b16c Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 7 Oct 2025 19:03:03 +0000 Subject: [PATCH 10/32] comment Signed-off-by: Sage Moore --- vllm/v1/worker/dp_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 143a5cdfdbb7..4eaff178b178 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -112,7 +112,7 @@ def _synchronize_dp_ranks( num_tokens_across_dp = tensor[1, :] should_dp_pad = bool(torch.all(tensor[3] == 1).item()) - # DP ranks should all have the same value for allow_padding + # DP ranks should all have the same value for allow_dp_padding assert allow_dp_padding == should_dp_pad if should_dp_pad: From 123c8e66090ec868668a9316bf7aed94aa768d33 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 8 Oct 2025 14:02:41 +0000 Subject: [PATCH 11/32] Update qwen next Signed-off-by: ilmarkov --- vllm/model_executor/models/qwen3_next_mtp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 220ac1ef30ff..75c1086b9e46 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -63,7 +63,6 @@ def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): self.mtp_start_layer_idx = config.num_hidden_layers self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1) - enable_eplb = vllm_config.parallel_config.enable_eplb self.embed_tokens = VocabParallelEmbedding( self.vocab_size, @@ -86,7 +85,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config, layer_type="full_attention", prefix=f"{prefix}.layers.{idx}", - enable_eplb=enable_eplb, ) for idx in range(self.num_mtp_layers) ) From 27b6437cb4b1cfc7bedab66690557c35daf8efa6 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 13 Oct 2025 12:09:10 +0000 Subject: [PATCH 12/32] Cleanup Signed-off-by: ilmarkov --- vllm/v1/spec_decode/eagle.py | 1 - vllm/v1/spec_decode/medusa.py | 8 +++++--- vllm/v1/worker/dp_utils.py | 1 - 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 64062a940d25..ad504da55fd8 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -61,7 +61,6 @@ def __init__( self.runner = runner self.device = device - self.dtype = vllm_config.model_config.dtype self.max_model_len = vllm_config.model_config.max_model_len self.block_size = vllm_config.cache_config.block_size diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index b8ec1b4e0540..12b903ccaca9 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -33,7 +33,6 @@ def __init__( vllm_config.speculative_config.draft_model_config.get_hidden_size() ) self.dtype = vllm_config.model_config.dtype - self.device = device def propose( self, @@ -50,7 +49,7 @@ def propose( draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits] return [list(row) for row in zip(*draft_tokens)] - def load_model(self, target_model: nn.Module, eep_scale_up: bool = False) -> None: + def load_model(self, target_model: nn.Module) -> None: from vllm.compilation.backends import set_model_tag with set_model_tag("medusa_head"): @@ -58,7 +57,10 @@ def load_model(self, target_model: nn.Module, eep_scale_up: bool = False) -> Non vllm_config=self.vllm_config, model_config=self.vllm_config.speculative_config.draft_model_config, ) - assert not is_mixture_of_experts(self.model) + assert not ( + is_mixture_of_experts(self.model) + and self.vllm_config.parallel_config.enable_eplb + ), "EPLB for Medusa is not supported" @torch.inference_mode() def dummy_run(self, num_tokens: int) -> None: diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 5dba8dd9b09c..3f24ff0a09de 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -178,7 +178,6 @@ def coordinate_batch_across_dp( Returns: tuple[ ubatch_slices: if this is set then all DP ranks have agreed to microbatch - num_tokens_after_padding: A tensor containing the total number of tokens per-microbatch for each DP rank including padding. 
Will be padded up to the max value across all DP ranks when allow_dp_padding From ff9f9927db162b91475760e68ad13a0c373b6a74 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 14 Oct 2025 14:34:34 +0000 Subject: [PATCH 13/32] Update after review Signed-off-by: ilmarkov --- tests/v1/e2e/test_eplb_spec_decode.py | 62 +++++++++++++++++++---- vllm/distributed/eplb/eplb_state.py | 4 +- vllm/model_executor/models/deepseek_v2.py | 2 +- vllm/model_executor/models/ernie45_moe.py | 2 +- vllm/model_executor/models/glm4_moe.py | 2 +- vllm/model_executor/models/hunyuan_v1.py | 2 +- vllm/model_executor/models/lfm2_moe.py | 2 +- vllm/model_executor/models/llama4.py | 19 ++++--- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/qwen3_next.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 4 +- 12 files changed, 75 insertions(+), 30 deletions(-) diff --git a/tests/v1/e2e/test_eplb_spec_decode.py b/tests/v1/e2e/test_eplb_spec_decode.py index 1919c10b87d5..98a830c96fbc 100644 --- a/tests/v1/e2e/test_eplb_spec_decode.py +++ b/tests/v1/e2e/test_eplb_spec_decode.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.utils import large_gpu_mark from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -22,10 +23,32 @@ def sampling_config(): return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) +def check_outputs(ref_outputs, spec_outputs): + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") + + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
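This 66% heuristic is intentionally loose: with the three test prompts used here, matches > int(0.66 * 3) requires at least two of the three spec-decode outputs to match the reference exactly. A quick standalone check of that arithmetic with placeholder strings:

ref = ["out-a", "out-b", "out-c"]
spec = ["out-a", "out-b", "different"]
matches = sum(r == s for r, s in zip(ref, spec))
assert matches > int(0.66 * len(ref))    # 2 > 1 -> passes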
+ assert matches > int(0.66 * len(ref_outputs)) + + @pytest.mark.parametrize( "model_setup", [ - ("meta-llama/Llama-4-Scout-17B-16E-Instruct", 4), + pytest.param( + ( + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + 4, + ), + marks=large_gpu_mark(min_gb=80), + ), # works on 4x H100 ], ids=["llama4"], ) @@ -40,7 +63,7 @@ def test_eplb_model( model_name, tp_size = model_setup test_prompts = create_test_prompts() - llm = LLM( + eplb_llm = LLM( model=model_name, tensor_parallel_size=tp_size, max_model_len=2048, @@ -50,12 +73,21 @@ def test_eplb_model( eplb_step_interval=16, eplb_log_balancedness=True, enable_eplb=True, - load_format="dummy", gpu_memory_utilization=0.95, ) + eplb_outputs = eplb_llm.generate(test_prompts, sampling_config) + del eplb_llm + torch.cuda.empty_cache() + + ref_llm = LLM( + model=model_name, + max_model_len=2048, + tensor_parallel_size=tp_size, + ) test_prompts = create_test_prompts() - llm.generate(test_prompts, sampling_config) - del llm + ref_outputs = ref_llm.generate(test_prompts, sampling_config) + check_outputs(ref_outputs, eplb_outputs) + del ref_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() @@ -69,8 +101,8 @@ def test_eplb_model( "eagle618/eagle-deepseek-v3-random", 4, ), - ("deepseek_mtp", "eagle618/deepseek-v3-random", None, 4), - ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4), + ("deepseek_mtp", "eagle618/deepseek-v3-random", None, 2), + ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 2), pytest.param( ( "eagle", @@ -93,7 +125,7 @@ def test_eplb_spec_decode( m.setenv("VLLM_MLA_DISABLE", "1") method, model_name, spec_model_name, tp_size = model_setup - llm = LLM( + spec_llm = LLM( model=model_name, trust_remote_code=True, tensor_parallel_size=tp_size, @@ -110,10 +142,18 @@ def test_eplb_spec_decode( eplb_step_interval=3000, eplb_log_balancedness=True, enable_eplb=True, - load_format="dummy", ) test_prompts = create_test_prompts() - llm.generate(test_prompts, sampling_config) - del llm + spec_outputs = spec_llm.generate(test_prompts, sampling_config) + del spec_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + ref_llm = LLM( + model=model_name, max_model_len=2048, tensor_parallel_size=tp_size + ) + ref_outputs = ref_llm.generate(test_prompts, sampling_config) + check_outputs(ref_outputs, spec_outputs) + del ref_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 16ba1a164d4d..42858e6be068 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -708,14 +708,12 @@ def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]: @classmethod def get_eep_state( - cls, parallel_config: ParallelConfig, eep_scale_up: bool = False + cls, parallel_config: ParallelConfig ) -> tuple[ list[torch.Tensor] | None, list[torch.Tensor] | None, dict[int, int] | None, ]: - if not eep_scale_up: - return None, None, None num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu") torch.distributed.broadcast( num_local_physical_experts, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 2646a59de5d2..ca676b423b4f 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -162,7 +162,7 @@ def __init__( self.routed_scaling_factor = config.routed_scaling_factor self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = 
get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts: int = config.n_routed_experts self.n_shared_experts: int = config.n_shared_experts diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index e01f26731cd9..a3828bacbba2 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -132,7 +132,7 @@ def __init__( self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", None) self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_groupn_group self.ep_size = self.ep_group.size() self.n_routed_experts: int = config.moe_num_experts self.n_shared_experts: int = self.moe_num_shared_experts diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index a53f52852c6a..8a21b256b06a 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -127,7 +127,7 @@ def __init__( self.routed_scaling_factor = config.routed_scaling_factor self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts: int = config.n_routed_experts self.n_shared_experts: int = config.n_shared_experts diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 901f29310872..f49923095394 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -374,7 +374,7 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts = config.num_experts diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index bb7926a9cfa9..c81f3514f882 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -105,7 +105,7 @@ def __init__( self.routed_scaling_factor = config.routed_scaling_factor self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts = config.num_experts diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a84cb5bdd4ab..55f21dba8f54 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -61,8 +61,6 @@ logger = init_logger(__name__) -logger = init_logger(__name__) - class Llama4MoE(nn.Module): @staticmethod @@ -88,7 +86,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): self.top_k = config.num_experts_per_tok self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() intermediate_size_moe = config.intermediate_size @@ -404,6 +402,9 @@ def __init__( layer_type: type[Llama4DecoderLayer] = Llama4DecoderLayer, ): self.num_experts = vllm_config.model_config.hf_config.num_local_experts + self.n_redundant_experts = ( + vllm_config.parallel_config.eplb_config.num_redundant_experts + ) super().__init__(vllm_config=vllm_config, prefix=prefix, layer_type=layer_type) def load_moe_expert_weights( @@ 
-510,6 +511,8 @@ def load_moe_expert_weights( .flatten() .to(new_loaded_weight.device) ) + # Take redundant experts into account + local_expert_indices %= new_loaded_weight.shape[0] new_loaded_weight = new_loaded_weight[local_expert_indices] expert_id = local_expert_indices[0].item() else: @@ -518,16 +521,17 @@ def load_moe_expert_weights( # Load the weight into the module parameter with corresponding # shard id and expert id. - weight_loader( + success = weight_loader( param, new_loaded_weight, full_param_name, shard_id=shard_id, expert_id=expert_id, + return_success=True, ) - - loaded_params.add(full_param_name) - expert_param_loaded = True + if success: + loaded_params.add(full_param_name) + expert_param_loaded = True return expert_param_loaded @@ -552,6 +556,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", num_experts=self.num_experts, + num_redundant_experts=self.n_redundant_experts, ) # Expert parameter mapping for the case where the expert weights are # fused into a single weight tensor. diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index bc56481820a9..f314633de0ef 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -98,7 +98,7 @@ def __init__( self.hidden_size = hidden_size self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() # Expert Parallelism Load balancing settings. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 8452d7b04f5c..59e625cd9a1d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -132,7 +132,7 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts = config.num_experts diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index a29def57c4a0..254f0095e3a7 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -107,7 +107,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): self.tp_size = get_tensor_model_parallel_world_size() self.ep_group = get_ep_group().device_group - self.ep_rank = self.ep_group.rank() + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts = config.num_experts diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3b9efad46468..43b14e046e8c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2849,7 +2849,9 @@ def load_model(self, eep_scale_up: bool = False) -> None: """ logger.info("Starting to load model %s...", self.model_config.model) global_expert_loads, old_global_expert_indices_per_model, rank_mapping = ( - EplbState.get_eep_state(eep_scale_up) + EplbState.get_eep_state(self.parallel_config) + if eep_scale_up + else (None, None, None) ) if self.parallel_config.enable_eplb: From d4532a62afdea1c260218a15dd077e6258872428 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 15 Oct 2025 08:55:43 +0000 Subject: [PATCH 14/32] Update buildkite pipeline test time Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c2f1fed33993..4b14fdbd19f4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -268,7 +268,7 @@ steps: - pytest -v -s tokenization - label: V1 Test e2e + engine # 30min - timeout_in_minutes: 50 + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ From 7c5b5b1c863aee4103fa7f70564b43a3dd5c1759 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Thu, 23 Oct 2025 16:59:16 +0000 Subject: [PATCH 15/32] Improve sync. Update after review Signed-off-by: ilmarkov --- tests/v1/e2e/test_eplb_spec_decode.py | 62 ++------------------ vllm/distributed/eplb/eplb_state.py | 83 ++++++++++++++++++++++----- vllm/v1/worker/gpu_worker.py | 50 ++++++++-------- 3 files changed, 95 insertions(+), 100 deletions(-) diff --git a/tests/v1/e2e/test_eplb_spec_decode.py b/tests/v1/e2e/test_eplb_spec_decode.py index 98a830c96fbc..8642c995a9c2 100644 --- a/tests/v1/e2e/test_eplb_spec_decode.py +++ b/tests/v1/e2e/test_eplb_spec_decode.py @@ -5,7 +5,6 @@ import pytest import torch -from tests.utils import large_gpu_mark from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -20,7 +19,7 @@ def create_test_prompts() -> list[str]: @pytest.fixture def sampling_config(): - return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) + return SamplingParams(temperature=0, max_tokens=64, ignore_eos=False) def check_outputs(ref_outputs, spec_outputs): @@ -39,59 +38,6 @@ def check_outputs(ref_outputs, spec_outputs): assert matches > int(0.66 * len(ref_outputs)) -@pytest.mark.parametrize( - "model_setup", - [ - pytest.param( - ( - "meta-llama/Llama-4-Scout-17B-16E-Instruct", - 4, - ), - marks=large_gpu_mark(min_gb=80), - ), # works on 4x H100 - ], - ids=["llama4"], -) -def test_eplb_model( - monkeypatch: pytest.MonkeyPatch, - sampling_config: SamplingParams, - model_setup: tuple[str, int], -): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - m.setenv("VLLM_MLA_DISABLE", "1") - - model_name, tp_size = model_setup - test_prompts = create_test_prompts() - eplb_llm = LLM( - model=model_name, - tensor_parallel_size=tp_size, - max_model_len=2048, - enable_expert_parallel=True, - num_redundant_experts=tp_size, - eplb_window_size=4, - eplb_step_interval=16, - eplb_log_balancedness=True, - enable_eplb=True, - gpu_memory_utilization=0.95, - ) - eplb_outputs = eplb_llm.generate(test_prompts, sampling_config) - del eplb_llm - torch.cuda.empty_cache() - - ref_llm = LLM( - model=model_name, - max_model_len=2048, - tensor_parallel_size=tp_size, - ) - test_prompts = create_test_prompts() - ref_outputs = ref_llm.generate(test_prompts, sampling_config) - check_outputs(ref_outputs, eplb_outputs) - del ref_llm - torch.cuda.empty_cache() - cleanup_dist_env_and_memory() - - @pytest.mark.parametrize( "model_setup", [ @@ -132,14 +78,14 @@ def test_eplb_spec_decode( speculative_config={ "method": method, "model": spec_model_name, - "num_speculative_tokens": 1, + "num_speculative_tokens": 4, "max_model_len": 2048, }, max_model_len=2048, enable_expert_parallel=True, num_redundant_experts=tp_size, - eplb_window_size=1000, - eplb_step_interval=3000, + eplb_window_size=8, + eplb_step_interval=32, eplb_log_balancedness=True, enable_eplb=True, ) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 42858e6be068..9d5601b43837 100644 --- 
a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -432,18 +432,17 @@ def step( eplb_model_state.expert_load_pass.zero_() if log_stats: - # total_expert_load_pass: (num_moe_layers, num_physical_experts) - for eplb_model_state in self.model_states.values(): - total_expert_load_pass = eplb_model_state.expert_load_pass.clone() - - # Collect load metrics from all ranks - ep_group = get_ep_group().device_group - all_reduce(total_expert_load_pass, group=ep_group) - + # Sync the expert load pass for each model (main and drafter). + # expert_load_pass: (num_moe_layers, num_physical_experts) + expert_load_pass_list = self._sync_load_pass() + ep_group = get_ep_group().device_group + for expert_load_pass, eplb_model_state in zip( + expert_load_pass_list, self.model_states.values() + ): # num_tokens_per_rank: (num_moe_layers, num_ranks) num_tokens_per_rank = ( - total_expert_load_pass.reshape( - total_expert_load_pass.shape[0], ep_group.size(), -1 + expert_load_pass.reshape( + expert_load_pass.shape[0], ep_group.size(), -1 ) .sum(dim=-1) .float() @@ -502,6 +501,19 @@ def rearrange( ) -> torch.Tensor | None: """ Rearrange the experts according to the current load. + + Args: + is_profile (bool): If `True`, perform a dummy rearrangement. + This is used in `profile_run` to reserve enough memory, + no memory movement will be performed. Default is False. + execute_shuffle (bool): If `True`, execute the shuffle in eep. + Default is True. + global_expert_loads (list[torch.Tensor] | None): + The global expert loads when scaling is done in eep. + List of expert loads for the main and drafter + (when spec decode is used) models. + rank_mapping (dict[int, int] | None): The rank mapping when scaling + is done in eep. """ ep_group = get_ep_group().device_group @@ -555,17 +567,21 @@ def rearrange( metadata, group=get_ep_group().cpu_group, group_src=0 ) - # Perform all-reduce to get the expert load across all ranks global_expert_load_window = logical_expert_load_window.sum(dim=0) - all_reduce(global_expert_load_window, group=ep_group) - - if not execute_shuffle: + global_expert_load_windows.append(global_expert_load_window) + # Perform all-reduce to get the expert load across all ranks for each model + global_expert_load_windows = self._allreduce_list( + global_expert_load_windows + ) + if not execute_shuffle: + for eplb_model_state, global_expert_load_window in zip( + self.model_states.values(), global_expert_load_windows + ): # (num_moe_layers, old_num_physical_experts) old_global_expert_indices = eplb_model_state.physical_to_logical_map torch.distributed.broadcast( old_global_expert_indices, group=ep_group, group_src=0 ) - global_expert_load_windows.append(global_expert_load_window) if not execute_shuffle: return global_expert_load_windows else: @@ -746,6 +762,43 @@ def get_eep_state( rank_mapping, ) + def _allreduce_list(self, tensor_list: list[torch.Tensor]) -> list[torch.Tensor]: + """ + All-reduce a list of tensors. + """ + if len(tensor_list) == 1: + all_reduce(tensor_list[0], group=get_ep_group().device_group) + return tensor_list + assert all(t.dim() == 2 for t in tensor_list), "All tensors must be 2D." + assert all(t.shape[1] == tensor_list[0].shape[1] for t in tensor_list), ( + "All tensors must have the same shape[1]." + ) + # Concatenate, all_reduce, then unpack to original shapes. + # We assume all tensors are 2D and shape[1] (num_physical_experts) + # is the same across all models. 
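+        # Illustration (symbolic shapes): load tensors of shape (L_main, E)
+        # for the main model and (L_draft, E) for the drafter are reduced as
+        # one concatenated (L_main + L_draft, E) tensor and then sliced back
+        # row-wise, so a single collective is issued instead of one per model.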
+ shapes = [t.shape for t in tensor_list] + concat_tensor = torch.cat(tensor_list, dim=0) + + ep_group = get_ep_group().device_group + all_reduce(concat_tensor, group=ep_group) + + all_reduce_list = [] + offset = 0 + for shape in shapes: + all_reduce_list.append(concat_tensor[offset : offset + shape[0], :]) + offset += shape[0] + return all_reduce_list + + def _sync_load_pass(self) -> list[torch.Tensor]: + """ + Sync the expert load pass across all ranks for log stats. + Doesn't update the expert load pass in eplb_model_state. + """ + load_pass_list = [] + for eplb_model_state in self.model_states.values(): + load_pass_list.append(eplb_model_state.expert_load_pass.clone()) + return self._allreduce_list(load_pass_list) + def _node_count_with_rank_mapping( pg: ProcessGroup | StatelessProcessGroup, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 61c94adf33bc..a26632f072de 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -629,7 +629,17 @@ def _reconfigure_moe( parallel_config = self.vllm_config.parallel_config - def update_moe_modules(moe_modules: list[FusedMoE]): + def get_moe_modules(model: torch.nn.Module) -> list[FusedMoE]: + return [ + module + for module in model.modules() + if ( + module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE" + ) + ] + + def update_moe_modules(moe_modules: list[FusedMoE], num_local_experts: int): assert all( module.moe_config.num_local_experts == num_local_experts for module in moe_modules @@ -643,36 +653,24 @@ def update_moe_modules(moe_modules: list[FusedMoE]): vllm_parallel_config=parallel_config, ) module.moe_config.moe_parallel_config = module.moe_parallel_config + return moe_modules - model_moe_modules = [ - module - for module in self.model_runner.model.modules() - if ( - module.__class__.__name__ == "FusedMoE" - or module.__class__.__name__ == "SharedFusedMoE" - ) - ] + model_moe_modules = get_moe_modules(self.model_runner.model) num_local_experts = model_moe_modules[0].moe_config.num_local_experts - update_moe_modules(model_moe_modules) - if ( - hasattr(self.model_runner, "drafter") - and hasattr(self.model_runner.drafter, "model") - and is_mixture_of_experts(self.model_runner.drafter.model) + update_moe_modules(model_moe_modules, num_local_experts) + drafter_model = None + if hasattr(self.model_runner, "drafter") and hasattr( + self.model_runner.drafter, "model" ): - drafter_moe_modules = [ - module - for module in self.model_runner.drafter.model.modules() - if ( - module.__class__.__name__ == "FusedMoE" - or module.__class__.__name__ == "SharedFusedMoE" - ) - ] + drafter_model = self.model_runner.drafter.model + if drafter_model is not None and is_mixture_of_experts(drafter_model): + drafter_moe_modules = get_moe_modules(drafter_model) # Check if drafter and model have matching configs assert ( drafter_moe_modules[0].moe_config.num_local_experts == num_local_experts ), "Drafter and model configs should be the same" - update_moe_modules(drafter_moe_modules) + update_moe_modules(drafter_moe_modules, num_local_experts) if new_ep_size < old_ep_size: num_local_physical_experts = num_local_experts @@ -702,10 +700,8 @@ def update_moe_modules(moe_modules: list[FusedMoE]): new_physical_experts - global_expert_loads[0].shape[1] ) prepare_communication_buffer_for_model(self.model_runner.model) - if hasattr(self.model_runner, "drafter") and hasattr( - self.model_runner.drafter, "model" - ): - prepare_communication_buffer_for_model(self.model_runner.drafter.model) + 
if drafter_model is not None: + prepare_communication_buffer_for_model(drafter_model) self.model_runner.model.update_physical_experts_metadata( num_physical_experts=new_physical_experts, num_local_physical_experts=num_local_physical_experts, From 96d4b37681207da1cdda2d92353f91fd6e2c3453 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Thu, 23 Oct 2025 17:00:30 +0000 Subject: [PATCH 16/32] Fix comment Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f8710406f1c3..8c593d8d3fd9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -267,7 +267,7 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test e2e + engine # 30min +- label: V1 Test e2e + engine # 60min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] source_file_dependencies: From 477a955c46fa3ebddf0bd1a50cb6f1d64748a16e Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 27 Oct 2025 09:17:29 +0000 Subject: [PATCH 17/32] Refactor Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 4 +- tests/v1/e2e/test_eplb_spec_decode.py | 16 ++- vllm/model_executor/layers/fused_moe/layer.py | 1 + vllm/model_executor/models/deepseek_eagle.py | 38 +----- vllm/model_executor/models/deepseek_mtp.py | 65 ++-------- vllm/model_executor/models/deepseek_v2.py | 107 ++++++++--------- vllm/model_executor/models/ernie45_moe.py | 16 --- vllm/model_executor/models/glm4_moe.py | 59 +++++----- vllm/model_executor/models/glm4_moe_mtp.py | 34 ++---- vllm/model_executor/models/hunyuan_v1.py | 18 +-- vllm/model_executor/models/interfaces.py | 14 ++- vllm/model_executor/models/lfm2_moe.py | 18 +-- vllm/model_executor/models/llama4.py | 18 +-- vllm/model_executor/models/mixtral.py | 18 +-- vllm/model_executor/models/nemotron_h.py | 18 +-- vllm/model_executor/models/qwen3_moe.py | 18 +-- vllm/model_executor/models/qwen3_next.py | 111 ++++++++---------- vllm/model_executor/models/qwen3_next_mtp.py | 64 +--------- 18 files changed, 190 insertions(+), 447 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0eb317f531c9..d940bd6b2349 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -269,8 +269,8 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test e2e + engine # 60min - timeout_in_minutes: 60 +- label: V1 Test e2e + engine # 50min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ diff --git a/tests/v1/e2e/test_eplb_spec_decode.py b/tests/v1/e2e/test_eplb_spec_decode.py index 8642c995a9c2..aa094d262cc4 100644 --- a/tests/v1/e2e/test_eplb_spec_decode.py +++ b/tests/v1/e2e/test_eplb_spec_decode.py @@ -47,8 +47,20 @@ def check_outputs(ref_outputs, spec_outputs): "eagle618/eagle-deepseek-v3-random", 4, ), - ("deepseek_mtp", "eagle618/deepseek-v3-random", None, 2), - ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 2), + pytest.param( + "deepseek_mtp", + "eagle618/deepseek-v3-random", + None, + 2, + marks=pytest.mark.skip(reason="Skipping for CI test time savings"), + ), + pytest.param( + "qwen3_next_mtp", + "Qwen/Qwen3-Next-80B-A3B-Instruct", + None, + 2, + marks=pytest.mark.skip(reason="Skipping for CI test time savings"), + ), pytest.param( ( "eagle", diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 
1667bfd4c7eb..9c067fd8f968 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1948,6 +1948,7 @@ def get_expert_weights(self) -> Iterable[torch.Tensor]: if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) and not name.startswith("_shared_experts.") + and not name.startswith("_gate") ] def set_eplb_state( diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 5caa21445ed3..fd2f20ea501d 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -22,7 +22,6 @@ ) from vllm.model_executor.models.deepseek_v2 import ( DeepseekV2DecoderLayer, - DeepseekV2MoE, DeepseekV3ForCausalLM, ) from vllm.utils import init_logger @@ -220,47 +219,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) # Set MoE hyperparameters + self.num_moe_layers = self.config.num_hidden_layers self.set_moe_parameters() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) - def set_moe_parameters(self): - self.expert_weights = [] - - self.num_moe_layers = self.config.num_hidden_layers - self.num_expert_groups = self.config.n_group - - self.moe_layers: list[FusedMoE] = [] - example_moe = None - for layer in self.model.layers: - assert isinstance(layer, DeepseekV2DecoderLayer) - if isinstance(layer.mlp, DeepseekV2MoE): - # Pick last one layer since the first ones may be dense layers. - example_moe = layer.mlp - self.moe_layers.append(layer.mlp.experts) - - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning( - "EagleDeepseekV3ForCausalLM: No DeepseekV2MoE layer found in " - "model.layers." 
- ) - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 31a028114e32..6c9bba44a217 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -26,10 +26,11 @@ from .deepseek_v2 import ( DeepseekV2DecoderLayer, + DeepseekV2MixtureOfExperts, DeepseekV2MoE, get_spec_layer_idx_from_weight_name, ) -from .interfaces import MixtureOfExperts, SupportsPP +from .interfaces import SupportsPP from .utils import maybe_prefix logger = init_logger(__name__) @@ -123,6 +124,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.mtp_start_layer_idx = config.num_hidden_layers self.num_mtp_layers = config.num_nextn_predict_layers # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict( { str(idx): DeepSeekMultiTokenPredictorLayer( @@ -176,7 +178,7 @@ def compute_logits( @support_torch_compile -class DeepSeekMTP(nn.Module, SupportsPP, MixtureOfExperts): +class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config @@ -192,7 +194,8 @@ def set_moe_parameters(self): self.num_moe_layers = self.config.num_nextn_predict_layers self.num_expert_groups = self.config.n_group - self.moe_layers: list[FusedMoE] = [] + self.moe_layers = [] + self.moe_mlp_layers = [] example_moe = None for layer in self.model.layers.values(): assert isinstance(layer, DeepSeekMultiTokenPredictorLayer) @@ -200,61 +203,9 @@ def set_moe_parameters(self): assert isinstance(layer, DeepseekV2DecoderLayer) if isinstance(layer.mlp, DeepseekV2MoE): example_moe = layer.mlp + self.moe_mlp_layers.append(layer.mlp) self.moe_layers.append(layer.mlp.experts) - - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("DeepSeekMTP: No DeepseekV2MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for layer in self.model.layers.values(): - assert isinstance(layer, DeepSeekMultiTokenPredictorLayer) - layer = layer.mtp_block - assert isinstance(layer, DeepseekV2DecoderLayer) - if isinstance(layer.mlp, DeepseekV2MoE): - moe = layer.mlp - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() + self.extract_moe_parameters(example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 061446b9e81d..a253cdffd901 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1122,7 +1122,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) else: self.embed_tokens = PPMissingLayer() - self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: DeepseekV2DecoderLayer( @@ -1172,7 +1171,50 @@ def forward( return hidden_states -class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoRA): +class DeepseekV2MixtureOfExperts(MixtureOfExperts): + moe_mlp_layers: list[DeepseekV2MoE] + """ + List of MoE MLP layers in the model. 
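+    Populated by the concrete model (e.g. DeepseekV2ForCausalLM or DeepSeekMTP)
+    and read by update_physical_experts_metadata to refresh per-layer expert
+    counts after an EPLB resize.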
+ """ + + def extract_moe_parameters(self, example_moe: DeepseekV2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class DeepseekV2ForCausalLM( + nn.Module, SupportsPP, DeepseekV2MixtureOfExperts, SupportsLoRA +): packed_modules_mapping = { "gate_up_proj": ["gate_proj", "up_proj"], } @@ -1214,17 +1256,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors ) # Set MoE hyperparameters + self.num_moe_layers = ( + self.config.num_hidden_layers - self.config.first_k_dense_replace + ) self.set_moe_parameters() def set_moe_parameters(self): self.expert_weights = [] - self.num_moe_layers = ( - self.config.num_hidden_layers - self.config.first_k_dense_replace - ) self.num_expert_groups = self.config.n_group - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers = [] + self.moe_mlp_layers = [] example_moe = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): @@ -1234,58 +1277,10 @@ def set_moe_parameters(self): if isinstance(layer.mlp, DeepseekV2MoE): # Pick last one layer since the first ones may be dense layers. example_moe = layer.mlp + self.moe_mlp_layers.append(layer.mlp) self.moe_layers.append(layer.mlp.experts) - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("DeepSeekV2: No DeepseekV2MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for layer in self.model.layers: - if isinstance(layer.mlp, DeepseekV2MoE): - moe = layer.mlp - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() + self.extract_moe_parameters(example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index db177782aabc..170ff183dab8 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -707,22 +707,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_shared_experts = example_moe.n_shared_experts self.num_redundant_experts = example_moe.n_redundant_experts - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 8a21b256b06a..54412585c6a9 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -62,7 +62,7 @@ ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -616,7 +616,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params -class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): +class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -671,31 +671,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): example_moe = layer.mlp self.moe_layers.append(layer.mlp.experts) - if example_moe is None: - raise RuntimeError("No Glm4MoE layer found in model.layers.") - - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: 
torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) + extract_moe_parameters(self, example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -726,6 +702,23 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.model.get_expert_mapping() + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for layer in self.model.layers: + if isinstance(layer.mlp, Glm4MoE): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def get_spec_layer_idx_from_weight_name( config: Glm4MoeConfig, weight_name: str @@ -738,3 +731,15 @@ def get_spec_layer_idx_from_weight_name( if f"layers.{layer_idx + i}." in weight_name: return layer_idx + i return None + + +def extract_moe_parameters(model: MixtureOfExperts, example_moe: Glm4MoE) -> None: + if example_moe is None: + raise RuntimeError("No Glm4MoE layer found in model.layers.") + else: + model.num_logical_experts = example_moe.n_logical_experts + model.num_physical_experts = example_moe.n_physical_experts + model.num_local_physical_experts = example_moe.n_local_physical_experts + model.num_routed_experts = example_moe.n_routed_experts + model.num_shared_experts = example_moe.n_shared_experts + model.num_redundant_experts = example_moe.n_redundant_experts diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index ff274ba14790..2f0fcb621f7f 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -41,7 +41,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors -from .glm4_moe import Glm4MoE, Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name +from .glm4_moe import ( + Glm4MoE, + Glm4MoeDecoderLayer, + extract_moe_parameters, + get_spec_layer_idx_from_weight_name, +) from .interfaces import MixtureOfExperts, SupportsPP from .utils import maybe_prefix @@ -198,36 +203,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if isinstance(layer.mlp, Glm4MoE): example_moe = layer.mlp self.moe_layers.append(layer.mlp.experts) - - if example_moe is None: - raise RuntimeError("No Glm4MoE layer found in model.layers.") - - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - 
self.num_redundant_experts = example_moe.n_redundant_experts + extract_moe_parameters(self, example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index f49923095394..8fa9776bd018 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -1007,7 +1007,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Set MoE hyperparameters self.expert_weights = [] self.num_expert_groups = 1 - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers = [] example_layer = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): @@ -1028,22 +1028,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_routed_experts = example_layer.n_routed_experts self.num_redundant_experts = example_layer.n_redundant_experts - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - self.expert_weights.append(layer.get_expert_weights()) - # Register the expert weights. - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 1bc5f5ae5419..95a29dee6817 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -14,6 +14,7 @@ import numpy as np import torch +import torch.nn as nn from torch import Tensor from transformers import PretrainedConfig from transformers.models.whisper.tokenization_whisper import LANGUAGES @@ -641,6 +642,9 @@ class MixtureOfExperts(Protocol): num_redundant_experts: int """Number of redundant experts in this model.""" + moe_layers: Iterable[nn.Module] + """List of MoE layers in this model.""" + def set_eplb_state( self, expert_load_view: Tensor, @@ -663,7 +667,15 @@ def set_eplb_state( logical_to_physical_map: Mapping from logical to physical experts. logical_replica_count: Count of replicas for each logical expert. """ - ... + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
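+            # (Each entry is the per-layer iterable of expert weight tensors;
+            # the EPLB rearrangement uses these when physically moving experts
+            # between ranks.)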
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) def update_physical_experts_metadata( self, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index c81f3514f882..02a490e9c7fd 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -707,7 +707,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # Set MoE hyperparameters self.expert_weights = [] - self.moe_layers: list[FusedMoE] = [] + self.moe_layers = [] example_layer = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): @@ -737,22 +737,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 55f21dba8f54..7bf945cc1830 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -739,7 +739,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def set_moe_parameters(self): self.expert_weights = [] - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers = [] example_moe = None for layer in self.model.layers: assert isinstance(layer, Llama4DecoderLayer) @@ -768,22 +768,6 @@ def set_moe_parameters(self): self.num_shared_experts = example_moe.n_shared_experts self.num_redundant_experts = example_moe.n_redundant_experts - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f314633de0ef..c1f411b6cd2a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -546,7 +546,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.expert_weights = [] - self.moe_layers: list[FusedMoE] = [] + self.moe_layers = [] example_moe = None for layer in self.model.layers: @@ -572,22 +572,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_expert_groups = 1 self.num_shared_experts = 0 - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index f31579e5cfa8..bea7aaced835 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -797,7 +797,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.expert_weights = [] self.num_expert_groups = config.n_group - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers = [] example_moe = None for layer in self.model.layers: if isinstance(layer, NemotronHMoEDecoderLayer): @@ -814,22 +814,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_shared_experts = example_moe.n_shared_experts self.num_redundant_experts = example_moe.n_redundant_experts - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 59e625cd9a1d..a7e6772bb708 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -665,7 +665,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Set MoE hyperparameters self.expert_weights = [] - self.moe_layers: list[FusedMoE] = [] + self.moe_layers = [] example_layer = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): @@ -688,22 +688,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_routed_experts = example_layer.n_routed_experts self.num_redundant_experts = example_layer.n_redundant_experts - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index da521d1ab18e..a5332006c87a 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -34,7 +34,7 @@ chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, ) -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -1088,8 +1088,57 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params +class QwenNextMixtureOfExperts(MixtureOfExperts): + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for layer in self.model.layers: + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + def set_moe_parameters(self): + self.expert_weights = [] + + self.moe_layers: list[FusedMoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, Qwen3NextDecoderLayer) and isinstance( + layer.mlp, Qwen3NextSparseMoeBlock + ): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError("No Qwen3Next layer found in the model.layers.") + + # Set MoE hyperparameters + 
self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + class Qwen3NextForCausalLM( - nn.Module, HasInnerState, SupportsLoRA, SupportsPP, MixtureOfExperts, IsHybrid + nn.Module, + HasInnerState, + SupportsLoRA, + SupportsPP, + QwenNextMixtureOfExperts, + IsHybrid, ): packed_modules_mapping = { "qkv_proj": [ @@ -1140,63 +1189,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) # Set MoE hyperparameters - self.expert_weights = [] - - self.moe_layers: list[SharedFusedMoE] = [] - example_layer = None - for layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - - assert isinstance(layer, Qwen3NextDecoderLayer) - if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): - example_layer = layer.mlp - self.moe_layers.append(layer.mlp.experts) - - if example_layer is None: - raise RuntimeError("No Qwen3Next layer found in the model.layers.") - - self.num_moe_layers = len(self.moe_layers) - self.num_expert_groups = 1 - self.num_shared_experts = 0 - self.num_logical_experts = example_layer.n_logical_experts - self.num_physical_experts = example_layer.n_physical_experts - self.num_local_physical_experts = example_layer.n_local_physical_experts - self.num_routed_experts = example_layer.n_routed_experts - self.num_redundant_experts = example_layer.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for layer in self.model.layers: - if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): - moe = layer.mlp - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() + self.set_moe_parameters() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 394f44ee5457..271b76adcff7 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -23,12 +23,12 @@ from vllm.model_executor.models.qwen3_next import ( Qwen3NextDecoderLayer, Qwen3NextRMSNorm, - Qwen3NextSparseMoeBlock, + QwenNextMixtureOfExperts, ) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Qwen3NextConfig -from .interfaces import MixtureOfExperts, SupportsPP +from .interfaces import SupportsPP from .utils import ( AutoWeightsLoader, is_pp_missing_parameter, @@ -227,7 +227,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @support_torch_compile -class Qwen3NextMTP(nn.Module, SupportsPP, MixtureOfExperts): +class Qwen3NextMTP(nn.Module, SupportsPP, QwenNextMixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -268,64 +268,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.set_moe_parameters() - def set_moe_parameters(self): - self.expert_weights = [] - - self.moe_layers: list[FusedMoE] = [] - example_moe = None - for layer in self.model.layers: - assert isinstance(layer, Qwen3NextDecoderLayer) - if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): - example_moe = layer.mlp - self.moe_layers.append(layer.mlp.experts) - - if example_moe is None: - raise RuntimeError("No Qwen3Next layer found in the model.layers.") - - # Set MoE hyperparameters - self.num_moe_layers = len(self.moe_layers) - self.num_expert_groups = 1 - self.num_shared_experts = 0 - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for layer in self.model.layers: - assert isinstance(layer, Qwen3NextDecoderLayer) - if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): - moe = layer.mlp - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 6880c9f31fbdc8f08721b47040fea0c0da8d6f8b Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 27 Oct 2025 09:38:43 +0000 Subject: [PATCH 18/32] Refactor glm4 Signed-off-by: ilmarkov --- vllm/model_executor/models/deepseek_mtp.py | 1 - vllm/model_executor/models/ernie45_moe.py | 2 +- vllm/model_executor/models/glm4_moe.py | 66 +++++++++++----------- vllm/model_executor/models/glm4_moe_mtp.py | 10 ++-- 4 files changed, 41 insertions(+), 38 deletions(-) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 6c9bba44a217..17b2da4b3dfe 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -190,7 +190,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def set_moe_parameters(self): self.expert_weights = [] - # Set MoE hyperparameters self.num_moe_layers = self.config.num_nextn_predict_layers self.num_expert_groups = self.config.n_group diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 170ff183dab8..e1ecd8f7681e 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -133,7 +133,7 @@ def __init__( self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", None) self.ep_group = get_ep_group().device_group - self.ep_rank = get_ep_group().rank_in_groupn_group + self.ep_rank = get_ep_group().rank_in_group self.ep_size = self.ep_group.size() self.n_routed_experts: int = config.moe_num_experts self.n_shared_experts: int = self.moe_num_shared_experts diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 54412585c6a9..b30bd66161da 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -616,7 +616,35 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params -class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts): +class Glm4MixtureOfExperts(MixtureOfExperts): + def extract_moe_parameters(self, example_moe: Glm4MoE | None) -> None: + if example_moe is None: + raise RuntimeError("No Glm4MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = 
example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, Glm4MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -659,7 +687,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace self.num_expert_groups = config.n_group - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers = [] + self.moe_mlp_layers: list[Glm4MoE] = [] + example_moe = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): @@ -669,9 +699,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if isinstance(layer.mlp, Glm4MoE): # Pick last one layer since the first ones may be dense layers. example_moe = layer.mlp + self.moe_mlp_layers.append(layer.mlp) self.moe_layers.append(layer.mlp.experts) - extract_moe_parameters(self, example_moe) + self.extract_moe_parameters(example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -702,23 +733,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: return self.model.get_expert_mapping() - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for layer in self.model.layers: - if isinstance(layer.mlp, Glm4MoE): - moe = layer.mlp - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - def get_spec_layer_idx_from_weight_name( config: Glm4MoeConfig, weight_name: str @@ -731,15 +745,3 @@ def get_spec_layer_idx_from_weight_name( if f"layers.{layer_idx + i}." 
in weight_name: return layer_idx + i return None - - -def extract_moe_parameters(model: MixtureOfExperts, example_moe: Glm4MoE) -> None: - if example_moe is None: - raise RuntimeError("No Glm4MoE layer found in model.layers.") - else: - model.num_logical_experts = example_moe.n_logical_experts - model.num_physical_experts = example_moe.n_physical_experts - model.num_local_physical_experts = example_moe.n_local_physical_experts - model.num_routed_experts = example_moe.n_routed_experts - model.num_shared_experts = example_moe.n_shared_experts - model.num_redundant_experts = example_moe.n_redundant_experts diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 2f0fcb621f7f..34f6df432ea7 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -42,12 +42,12 @@ from vllm.sequence import IntermediateTensors from .glm4_moe import ( + Glm4MixtureOfExperts, Glm4MoE, Glm4MoeDecoderLayer, - extract_moe_parameters, get_spec_layer_idx_from_weight_name, ) -from .interfaces import MixtureOfExperts, SupportsPP +from .interfaces import SupportsPP from .utils import maybe_prefix @@ -180,7 +180,7 @@ def compute_logits( return logits -class Glm4MoeMTP(nn.Module, SupportsPP, MixtureOfExperts): +class Glm4MoeMTP(nn.Module, SupportsPP, Glm4MixtureOfExperts): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.config = vllm_config.model_config.hf_config @@ -195,6 +195,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_expert_groups = self.config.n_group self.moe_layers: list[FusedMoE] = [] + self.moe_mlp_layers: list[Glm4MoE] = [] example_moe = None for layer in self.model.layers.values(): assert isinstance(layer, Glm4MoeMultiTokenPredictor) @@ -202,8 +203,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): assert isinstance(layer, Glm4MoeDecoderLayer) if isinstance(layer.mlp, Glm4MoE): example_moe = layer.mlp + self.moe_mlp_layers.append(layer.mlp) self.moe_layers.append(layer.mlp.experts) - extract_moe_parameters(self, example_moe) + self.extract_moe_parameters(self, example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 4ab42aa9c564c034f1d9445f88e84afa0c4e8191 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 27 Oct 2025 09:51:22 +0000 Subject: [PATCH 19/32] Update moemixin Signed-off-by: ilmarkov --- vllm/model_executor/models/transformers/moe.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 5de786f99580..58017a73fd3c 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -125,7 +125,7 @@ def set_eplb_state( logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, ): - for moe_layer_idx, mlp_layer in enumerate(self.mlp_layers): + for moe_layer_idx, mlp_layer in enumerate(self.mlp_moe_layers): mlp_layer.experts.set_eplb_state( moe_layer_idx=moe_layer_idx, expert_load_view=expert_load_view, @@ -142,7 +142,7 @@ def update_physical_experts_metadata( self.num_physical_experts = num_physical_experts self.num_local_physical_experts = num_local_physical_experts self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for mlp in self.mlp_layers: + for mlp in self.mlp_moe_layers: mlp.n_local_physical_experts = 
num_local_physical_experts mlp.n_physical_experts = num_physical_experts mlp.n_redundant_experts = self.num_redundant_experts @@ -240,7 +240,8 @@ def forward(self, *args, **kwargs): # MixtureOfExperts mixin settings ep_size = get_ep_group().world_size - self.mlp_layers = [] # Used for MixtureOfExperts methods + self.mlp_moe_layers = [] # Used for MixtureOfExperts methods + self.moe_layers = [] self.expert_weights = [] self.num_moe_layers = 0 self.num_expert_groups = 1 if num_expert_group is None else num_expert_group @@ -298,7 +299,8 @@ def _recursive_replace(module: nn.Module, prefix: str): mlp.experts = fused_experts log_replacement(qual_name, experts, fused_experts) # Update MixtureOfExperts mixin state - self.mlp_layers.append(mlp) + self.mlp_moe_layers.append(mlp) + self.moe_layers.append(fused_experts) self.expert_weights.append(fused_experts.get_expert_weights()) self.num_moe_layers += 1 # If results are not all-reduced in FusedMoE, ensure they From 7d0ee2833b67a00f53413876262429207ffafc94 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 28 Oct 2025 14:32:30 +0100 Subject: [PATCH 20/32] Update comment for V1 Test e2e + engine Updated timeout for V1 Test e2e + engine from 50 to 35 minutes. --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9fa10db2ed3e..7275cc6c163f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -269,7 +269,7 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test e2e + engine # 50min +- label: V1 Test e2e + engine # 35min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: From d12909756959f3f046ac4ae2f0dd451ad31d6d53 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Mon, 27 Oct 2025 10:07:09 +0000 Subject: [PATCH 21/32] Update startup logging Signed-off-by: ilmarkov --- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9657233bb108..d84defd23922 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2875,14 +2875,14 @@ def load_model(self, eep_scale_up: bool = False) -> None: self.model, self.vllm_config, self.device ) if hasattr(self, "drafter"): - logger.info("Loading drafter model...") + logger.info_once("Loading drafter model...") self.drafter.load_model(self.model) if ( hasattr(self.drafter, "model") and is_mixture_of_experts(self.drafter.model) and self.parallel_config.enable_eplb ): - logger.info( + logger.info_once( "EPLB is enabled for drafter model %s.", self.vllm_config.speculative_config.draft_model_config.model, ) @@ -2942,7 +2942,7 @@ def load_model(self, eep_scale_up: bool = False) -> None: ) if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb: - logger.info("EPLB is enabled for model %s.", self.model_config.model) + logger.info_once("EPLB is enabled for model %s.", self.model_config.model) global_expert_load = ( global_expert_loads[eplb_models] if global_expert_loads else None ) From a77b99f3f8de9e7f677d4026f94709d7be639ded Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 28 Oct 2025 16:30:04 +0000 Subject: [PATCH 22/32] Update test Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 5 +- tests/distributed/test_eplb_spec_decode.py | 97 +++++++++++++++++ tests/v1/e2e/test_eplb_spec_decode.py | 117 --------------------- 
vllm/distributed/eplb/eplb_state.py | 10 +- 4 files changed, 106 insertions(+), 123 deletions(-) create mode 100644 tests/distributed/test_eplb_spec_decode.py delete mode 100644 tests/v1/e2e/test_eplb_spec_decode.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7275cc6c163f..6b1c2da83f83 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -223,6 +223,7 @@ steps: - tests/distributed/test_eplb_execute.py commands: - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 @@ -269,8 +270,8 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test e2e + engine # 35min - timeout_in_minutes: 50 +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py new file mode 100644 index 000000000000..308757dfd605 --- /dev/null +++ b/tests/distributed/test_eplb_spec_decode.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import lm_eval +import pytest + + +def get_model_args( + model_name: str, + spec_model_name: str, + spec_method: str, + tp_size: int, + model_max_len: int, +) -> dict: + speculative_config = { + "method": spec_method, + "model": spec_model_name, + "num_speculative_tokens": 4, + "max_model_len": model_max_len, + } + + model_args = { + "pretrained": model_name, + "dtype": "auto", + "add_bos_token": True, + "tensor_parallel_size": tp_size, + "gpu_memory_utilization": 0.7, + "speculative_config": speculative_config, + "enable_expert_parallel": True, + "num_redundant_experts": tp_size, + "eplb_window_size": 32, + "eplb_step_interval": 128, + "eplb_log_balancedness": False, + "enable_eplb": True, + "max_model_len": model_max_len, + } + return model_args + + +@pytest.mark.parametrize( + "model_setup", + [ + pytest.param( + ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86), + ), + pytest.param( + ( + "eagle", + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", + 4, + 0.86, + ), + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"), + ), + ], + ids=["qwen3_next_mtp", "llama4_eagle"], +) +def test_eplb_spec_decode( + monkeypatch: pytest.MonkeyPatch, + model_setup: tuple[str, str, str, int, float], +): + """ + Test the correctness of EPLB speculative decoding with GSM8K dataset. + Applicable to MoE models with mtp or eagle spec decode. 
+ """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") + + method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup + + TASK = "gsm8k" + FILTER = "exact_match,strict-match" + RTOL = 0.03 + + model_args = get_model_args( + model_name=model_name, + spec_model_name=spec_model_name, + spec_method=method, + tp_size=tp_size, + model_max_len=4096, + ) + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=TASK, + batch_size=64, + num_fewshot=8, + ) + measured_value = results["results"][TASK][FILTER] + assert ( + measured_value - RTOL < expected_gsm8k_value + and measured_value + RTOL > expected_gsm8k_value + ), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}" diff --git a/tests/v1/e2e/test_eplb_spec_decode.py b/tests/v1/e2e/test_eplb_spec_decode.py deleted file mode 100644 index aa094d262cc4..000000000000 --- a/tests/v1/e2e/test_eplb_spec_decode.py +++ /dev/null @@ -1,117 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - -import pytest -import torch - -from vllm import LLM, SamplingParams -from vllm.distributed import cleanup_dist_env_and_memory - - -def create_test_prompts() -> list[str]: - return [ - "A robot may not injure a human being", - "To be or not to be,", - "What is the meaning of life?", - ] - - -@pytest.fixture -def sampling_config(): - return SamplingParams(temperature=0, max_tokens=64, ignore_eos=False) - - -def check_outputs(ref_outputs, spec_outputs): - matches = 0 - misses = 0 - for ref_output, spec_output in zip(ref_outputs, spec_outputs): - if ref_output.outputs[0].text == spec_output.outputs[0].text: - matches += 1 - else: - misses += 1 - print(f"ref_output: {ref_output.outputs[0].text}") - print(f"spec_output: {spec_output.outputs[0].text}") - - # Heuristic: expect at least 66% of the prompts to match exactly - # Upon failure, inspect the outputs to check for inaccuracy. 
- assert matches > int(0.66 * len(ref_outputs)) - - -@pytest.mark.parametrize( - "model_setup", - [ - ( - "eagle", - "eagle618/deepseek-v3-random", - "eagle618/eagle-deepseek-v3-random", - 4, - ), - pytest.param( - "deepseek_mtp", - "eagle618/deepseek-v3-random", - None, - 2, - marks=pytest.mark.skip(reason="Skipping for CI test time savings"), - ), - pytest.param( - "qwen3_next_mtp", - "Qwen/Qwen3-Next-80B-A3B-Instruct", - None, - 2, - marks=pytest.mark.skip(reason="Skipping for CI test time savings"), - ), - pytest.param( - ( - "eagle", - "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", - 4, - ), - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"), - ), - ], - ids=["deepseek_eagle", "deepseek_mtp", "qwen3_next_mtp", "llama4_eagle"], -) -def test_eplb_spec_decode( - monkeypatch: pytest.MonkeyPatch, - sampling_config: SamplingParams, - model_setup: tuple[str, str, str, int], -): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - m.setenv("VLLM_MLA_DISABLE", "1") - - method, model_name, spec_model_name, tp_size = model_setup - spec_llm = LLM( - model=model_name, - trust_remote_code=True, - tensor_parallel_size=tp_size, - speculative_config={ - "method": method, - "model": spec_model_name, - "num_speculative_tokens": 4, - "max_model_len": 2048, - }, - max_model_len=2048, - enable_expert_parallel=True, - num_redundant_experts=tp_size, - eplb_window_size=8, - eplb_step_interval=32, - eplb_log_balancedness=True, - enable_eplb=True, - ) - test_prompts = create_test_prompts() - spec_outputs = spec_llm.generate(test_prompts, sampling_config) - del spec_llm - torch.cuda.empty_cache() - cleanup_dist_env_and_memory() - - ref_llm = LLM( - model=model_name, max_model_len=2048, tensor_parallel_size=tp_size - ) - ref_outputs = ref_llm.generate(test_prompts, sampling_config) - check_outputs(ref_outputs, spec_outputs) - del ref_llm - torch.cuda.empty_cache() - cleanup_dist_env_and_memory() diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 9d5601b43837..b235dd47c6eb 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -463,8 +463,9 @@ def step( if ep_group.rank() == 0: logger.info( - "EPLB step for model %s: avg_tokens=%.2f, " + "EPLB step: %d for model %s: avg_tokens=%.2f, " "max_tokens=%d, balancedness=%.4f", + self.expert_rearrangement_step, eplb_model_state.model_name, avg_tokens, max_tokens, @@ -506,14 +507,15 @@ def rearrange( is_profile (bool): If `True`, perform a dummy rearrangement. This is used in `profile_run` to reserve enough memory, no memory movement will be performed. Default is False. - execute_shuffle (bool): If `True`, execute the shuffle in eep. + execute_shuffle (bool): If `True`, execute the shuffle + in elastic expert parallel (EEP). Default is True. global_expert_loads (list[torch.Tensor] | None): - The global expert loads when scaling is done in eep. + The global expert loads when scaling is done in EEP. List of expert loads for the main and drafter (when spec decode is used) models. rank_mapping (dict[int, int] | None): The rank mapping when scaling - is done in eep. + is done in EEP. 
""" ep_group = get_ep_group().device_group From ef3c9a1f325d7fd76a98a8f93da4be7088293f4a Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 28 Oct 2025 16:49:28 +0000 Subject: [PATCH 23/32] Upd test constants Signed-off-by: ilmarkov --- tests/distributed/test_eplb_spec_decode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index 308757dfd605..345b80ff9f00 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -29,8 +29,8 @@ def get_model_args( "speculative_config": speculative_config, "enable_expert_parallel": True, "num_redundant_experts": tp_size, - "eplb_window_size": 32, - "eplb_step_interval": 128, + "eplb_window_size": 128, + "eplb_step_interval": 1024, "eplb_log_balancedness": False, "enable_eplb": True, "max_model_len": model_max_len, @@ -50,7 +50,7 @@ def get_model_args( "meta-llama/Llama-4-Scout-17B-16E-Instruct", "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4, - 0.86, + 0.92, ), marks=pytest.mark.skip(reason="Skipping due to CI OOM issues"), ), From 7e60b2670213cc8a702645d60f32898dd25e6c0b Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 28 Oct 2025 17:14:09 +0000 Subject: [PATCH 24/32] Upd test time Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6b1c2da83f83..f08a2e5ac2f9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -214,8 +214,8 @@ steps: commands: - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 5min - timeout_in_minutes: 15 +- label: EPLB Execution Test # 10min + timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: From f4fad37f4920e58f1a1df553fb5ef4074eb285ba Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 29 Oct 2025 20:44:06 +0000 Subject: [PATCH 25/32] Upd Signed-off-by: ilmarkov --- tests/distributed/test_eplb_spec_decode.py | 50 ++++++++++------------ vllm/distributed/eplb/eplb_state.py | 20 ++++----- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index 345b80ff9f00..59b7ae367e15 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -65,33 +65,29 @@ def test_eplb_spec_decode( Test the correctness of EPLB speculative decoding with GSM8K dataset. Applicable to MoE models with mtp or eagle spec decode. 
""" - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - m.setenv("VLLM_MLA_DISABLE", "1") + method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup - method, model_name, spec_model_name, tp_size, expected_gsm8k_value = model_setup + TASK = "gsm8k" + FILTER = "exact_match,strict-match" + RTOL = 0.03 - TASK = "gsm8k" - FILTER = "exact_match,strict-match" - RTOL = 0.03 + model_args = get_model_args( + model_name=model_name, + spec_model_name=spec_model_name, + spec_method=method, + tp_size=tp_size, + model_max_len=4096, + ) - model_args = get_model_args( - model_name=model_name, - spec_model_name=spec_model_name, - spec_method=method, - tp_size=tp_size, - model_max_len=4096, - ) - - results = lm_eval.simple_evaluate( - model="vllm", - model_args=model_args, - tasks=TASK, - batch_size=64, - num_fewshot=8, - ) - measured_value = results["results"][TASK][FILTER] - assert ( - measured_value - RTOL < expected_gsm8k_value - and measured_value + RTOL > expected_gsm8k_value - ), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}" + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=TASK, + batch_size=64, + num_fewshot=8, + ) + measured_value = results["results"][TASK][FILTER] + assert ( + measured_value - RTOL < expected_gsm8k_value + and measured_value + RTOL > expected_gsm8k_value + ), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}" diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index b235dd47c6eb..526d3ceac7b8 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -405,7 +405,6 @@ def step( Step the EPLB state. Args: - model (MixtureOfExperts): The MoE model. is_dummy (bool): If `True`, this is a dummy step and the load metrics recorded in this forward pass will not count. Defaults to `False`. @@ -505,17 +504,16 @@ def rearrange( Args: is_profile (bool): If `True`, perform a dummy rearrangement. - This is used in `profile_run` to reserve enough memory, - no memory movement will be performed. Default is False. + This is used in `profile_run` to reserve enough memory, + no memory movement will be performed. Default is False. execute_shuffle (bool): If `True`, execute the shuffle - in elastic expert parallel (EEP). - Default is True. - global_expert_loads (list[torch.Tensor] | None): - The global expert loads when scaling is done in EEP. - List of expert loads for the main and drafter - (when spec decode is used) models. - rank_mapping (dict[int, int] | None): The rank mapping when scaling - is done in EEP. + in elastic expert parallel (EEP). Default is True. + global_expert_loads (list[torch.Tensor] | None): The global expert + loads when scaling is done in EEP. + List of expert loads for the main and drafter + (when spec decode is used) models. + rank_mapping (dict[int, int] | None): The rank mapping + when scaling is done in EEP. 
""" ep_group = get_ep_group().device_group From 94e33902156a0e84bdc16fee6fa02195340ec7cd Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Fri, 31 Oct 2025 10:28:42 +0000 Subject: [PATCH 26/32] Fix glm4moe Signed-off-by: ilmarkov --- vllm/model_executor/models/glm4_moe_mtp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 34f6df432ea7..9a2ae3c476f0 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -29,7 +29,7 @@ import torch.nn as nn from transformers import PretrainedConfig -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -78,6 +78,7 @@ def __init__( prefix: str, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, + parallel_config: ParallelConfig | None = None, ) -> None: super().__init__() self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -86,11 +87,13 @@ def __init__( self.shared_head = SharedHead( config=config, prefix=prefix, quant_config=quant_config ) + self.enable_eplb = parallel_config.enable_eplb self.mtp_block = Glm4MoeDecoderLayer( config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, + enable_eplb=self.enable_eplb, ) def forward( @@ -132,6 +135,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"{prefix}.layers.{idx}", cache_config=vllm_config.cache_config, quant_config=vllm_config.quant_config, + parallel_config=vllm_config.parallel_config, ) for idx in range( self.mtp_start_layer_idx, @@ -198,14 +202,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.moe_mlp_layers: list[Glm4MoE] = [] example_moe = None for layer in self.model.layers.values(): - assert isinstance(layer, Glm4MoeMultiTokenPredictor) + assert isinstance(layer, Glm4MoeMultiTokenPredictorLayer) layer = layer.mtp_block assert isinstance(layer, Glm4MoeDecoderLayer) if isinstance(layer.mlp, Glm4MoE): example_moe = layer.mlp self.moe_mlp_layers.append(layer.mlp) self.moe_layers.append(layer.mlp.experts) - self.extract_moe_parameters(self, example_moe) + self.extract_moe_parameters(example_moe) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 09f98691fc7f1e880ef9f4e129da3f61b948689c Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Fri, 31 Oct 2025 15:49:43 +0000 Subject: [PATCH 27/32] Fix CI Signed-off-by: ilmarkov --- tests/distributed/test_eplb_spec_decode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index 59b7ae367e15..e9d741cb01e3 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -25,7 +25,7 @@ def get_model_args( "dtype": "auto", "add_bos_token": True, "tensor_parallel_size": tp_size, - "gpu_memory_utilization": 0.7, + "gpu_memory_utilization": 0.9, "speculative_config": speculative_config, "enable_expert_parallel": True, "num_redundant_experts": tp_size, @@ -42,7 +42,7 @@ def get_model_args( "model_setup", [ pytest.param( - ("qwen3_next_mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86), + ("mtp", 
"Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86), ), pytest.param( ( From 74f806b94639627e50677a1ed8868585db9fa477 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Fri, 31 Oct 2025 20:13:55 +0100 Subject: [PATCH 28/32] Update gpu_memory_utilization to 0.93 --- tests/distributed/test_eplb_spec_decode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index e9d741cb01e3..145f6241d274 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -25,7 +25,7 @@ def get_model_args( "dtype": "auto", "add_bos_token": True, "tensor_parallel_size": tp_size, - "gpu_memory_utilization": 0.9, + "gpu_memory_utilization": 0.93, "speculative_config": speculative_config, "enable_expert_parallel": True, "num_redundant_experts": tp_size, From 0e8dc736707b61fe8c83c6863c4e0127fa10dc89 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Sun, 2 Nov 2025 21:14:19 +0000 Subject: [PATCH 29/32] Fix Signed-off-by: ilmarkov --- tests/distributed/test_eplb_spec_decode.py | 3 +++ vllm/model_executor/models/llama4.py | 10 +++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index 145f6241d274..c14388872fa0 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -5,6 +5,8 @@ import lm_eval import pytest +from tests.utils import large_gpu_mark + def get_model_args( model_name: str, @@ -43,6 +45,7 @@ def get_model_args( [ pytest.param( ("mtp", "Qwen/Qwen3-Next-80B-A3B-Instruct", None, 4, 0.86), + marks=large_gpu_mark(min_gb=80), ), pytest.param( ( diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 7bf945cc1830..a7e0732ec71e 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -511,8 +511,6 @@ def load_moe_expert_weights( .flatten() .to(new_loaded_weight.device) ) - # Take redundant experts into account - local_expert_indices %= new_loaded_weight.shape[0] new_loaded_weight = new_loaded_weight[local_expert_indices] expert_id = local_expert_indices[0].item() else: @@ -521,17 +519,15 @@ def load_moe_expert_weights( # Load the weight into the module parameter with corresponding # shard id and expert id. 
- success = weight_loader( + weight_loader( param, new_loaded_weight, full_param_name, shard_id=shard_id, expert_id=expert_id, - return_success=True, ) - if success: - loaded_params.add(full_param_name) - expert_param_loaded = True + loaded_params.add(full_param_name) + expert_param_loaded = True return expert_param_loaded From b88f680e30dccfc5ce0e194000adb40c49e05f6f Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Sun, 2 Nov 2025 21:48:01 +0000 Subject: [PATCH 30/32] Fix oom Signed-off-by: ilmarkov --- tests/distributed/test_eplb_spec_decode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index c14388872fa0..11e23f128f33 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -18,7 +18,7 @@ def get_model_args( speculative_config = { "method": spec_method, "model": spec_model_name, - "num_speculative_tokens": 4, + "num_speculative_tokens": 1, "max_model_len": model_max_len, } @@ -27,7 +27,7 @@ def get_model_args( "dtype": "auto", "add_bos_token": True, "tensor_parallel_size": tp_size, - "gpu_memory_utilization": 0.93, + "gpu_memory_utilization": 0.7, "speculative_config": speculative_config, "enable_expert_parallel": True, "num_redundant_experts": tp_size, From e4fa2419041156cf70b6ea866c8af91b023e22c0 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 4 Nov 2025 17:32:24 +0000 Subject: [PATCH 31/32] Update moe_layers. Clean OpenPangu Signed-off-by: ilmarkov --- vllm/model_executor/models/openpangu.py | 18 +----------------- vllm/model_executor/models/qwen3_next.py | 4 ++-- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 457498d995f8..bf1b7570a882 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -1009,7 +1009,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_moe_layers = config.num_hidden_layers - config.first_k_dense_replace self.num_expert_groups = 1 - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers = [] example_moe = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): @@ -1031,22 +1031,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.n_shared_experts = example_moe.n_shared_experts self.num_redundant_experts = example_moe.n_redundant_experts - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - def update_physical_experts_metadata( self, num_physical_experts: int, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 8e90a2c53b1b..fbfd02d08d08 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -33,7 +33,7 @@ chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, ) -from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE +from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import ( GemmaRMSNorm as Qwen3NextRMSNorm, ) @@ -1150,7 +1150,7 @@ def update_physical_experts_metadata( def set_moe_parameters(self): self.expert_weights = [] - self.moe_layers: list[FusedMoE] = [] + self.moe_layers = [] example_moe = None for layer in self.model.layers: if isinstance(layer, Qwen3NextDecoderLayer) and isinstance( From deb21b15b0947ce14a7b08ef877bdb5b36cfcfbc Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Tue, 4 Nov 2025 17:36:50 +0000 Subject: [PATCH 32/32] Fix mypy Signed-off-by: ilmarkov --- vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 8d520f5bf8ef..950139c69c29 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -226,7 +226,7 @@ def _decorator(obj: type[ToolParser]) -> type[ToolParser]: if isinstance(name, str): names = [name] - elif is_list_of(name, str): + elif name is not None and is_list_of(name, str): names = name else: names = [class_name]
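
Taken together, the patches above wire EPLB into MoE drafter models (DeepSeek MTP, Qwen3-Next MTP, GLM4-MoE MTP, EAGLE) and exercise it through the GSM8K test in tests/distributed/test_eplb_spec_decode.py. The following is a rough offline-inference sketch of the same configuration; the engine arguments are copied from the tests in this series, while the model name, parallel size, and EPLB intervals are illustrative choices rather than values the patches require.

# Illustrative sketch only (not part of the patch series): run a MoE target
# model with EPLB enabled together with an MTP drafter, mirroring the engine
# arguments used in tests/distributed/test_eplb_spec_decode.py and the removed
# tests/v1/e2e/test_eplb_spec_decode.py.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",  # example MoE model from the tests
    trust_remote_code=True,
    tensor_parallel_size=4,
    max_model_len=4096,
    enable_expert_parallel=True,
    enable_eplb=True,
    num_redundant_experts=4,      # the tests use tp_size redundant experts
    eplb_window_size=128,
    eplb_step_interval=1024,
    eplb_log_balancedness=False,
    speculative_config={
        "method": "mtp",          # or "eagle" with an explicit draft model
        "num_speculative_tokens": 1,
        "max_model_len": 4096,
    },
)

outputs = llm.generate(
    ["What is the meaning of life?"],
    SamplingParams(temperature=0, max_tokens=64),
)
print(outputs[0].outputs[0].text)

Because the drafter builds its own EplbState when it is a mixture-of-experts model (see the eagle.py and gpu_model_runner.py hunks earlier in the series), the same enable_eplb flag and speculative_config cover both the target and the draft model; no drafter-specific EPLB flags are introduced.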