From e11b28a1e1c45e4b6b7c8bd380d705b1baa98b35 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Jul 2025 13:43:03 +0800
Subject: [PATCH 01/44] init

Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
---
 benchmarks/kernels/benchmark_moe.py     |  24 +-
 tests/models/registry.py                |   1 +
 vllm/model_executor/models/glm4_moe.py  | 664 ++++++++++++++++++++++++
 vllm/model_executor/models/registry.py  |   1 +
 vllm/transformers_utils/configs/ovis.py |   2 +-
 5 files changed, 675 insertions(+), 17 deletions(-)
 create mode 100644 vllm/model_executor/models/glm4_moe.py

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 07af58d81c68..7ab63eaffba0 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -563,22 +563,14 @@ def main(args: argparse.Namespace):
     if args.model_prefix:
         config = getattr(config, args.model_prefix)
 
-    if config.architectures[0] == "DbrxForCausalLM":
-        E = config.ffn_config.moe_num_experts
-        topk = config.ffn_config.moe_top_k
-        intermediate_size = config.ffn_config.ffn_hidden_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] == "JambaForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"):
-        E = config.n_routed_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.moe_intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+    if config.architectures[0] in (
+        "DbrxForCausalLM",
+        "JambaForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM", "Qwen2MoeForCausalLM",
+        "Qwen3MoeForCausalLM",
+        "Glm4MoeForCausalLM",
+    ):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 04fff03862fc..7f5b2b96cf87 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -348,6 +348,7 @@ def check_available_online(
                                       trust_remote_code=True,
                                       hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
     "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"),  # noqa: E501
+    "Glm4MoeForCausalLM": _HfExamplesInfo("/model/GLM-4-MoE-100B-A10B", min_transformers_version="4.54"),  # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
                                       extras={"2b": "h2oai/h2ovl-mississippi-2b"},  # noqa: E501
                                       max_transformers_version="4.48",  # noqa: E501
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
new file mode 100644
index 000000000000..dd9f738aeebd
--- /dev/null
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -0,0 +1,664 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The ZhipuAI Team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only GLM-4-MOE model compatible with HuggingFace weights.""" + +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .utils import (AutoWeightsLoader, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +logger = init_logger(__name__) + + +class Glm4MoeMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Glm4MoeTopkRouter(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size))) + self.register_buffer("e_score_correction_bias",torch.zeros((self.n_routed_experts))) + + @torch.no_grad() + def get_topk_indices(self, scores): + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) + + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] + group_mask = torch.zeros_like(group_scores) + group_mask.scatter_(1, group_idx, 1) + + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) + topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] + return topk_indices + + def forward(self, hidden_states): + hidden_states = hidden_states.view(-1, self.config.hidden_size) + router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32)) + scores = router_logits.sigmoid() + + topk_indices = self.get_topk_indices(scores) + topk_weights = scores.gather(1, topk_indices) + + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights /= denominator + + topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights + + +class Glm4MoeSparseMoeBlock(nn.Module): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.config = config + self.num_experts = config.n_routed_experts + self.top_k = config.num_experts_per_tok + self.norm_topk_prob = config.norm_topk_prob + + if self.tp_size > self.num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.num_experts}.") + + self.gate = Glm4MoeTopkRouter( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.gate" + ) + + self.experts = FusedMoE( + num_experts=self.num_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts" + ) + + self.shared_experts = Glm4MoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size * config.n_shared_experts, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_experts" + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + residuals = 
hidden_states + orig_shape = hidden_states.shape + hidden_dim = hidden_states.shape[-1] + hidden_states = hidden_states.view(-1, hidden_dim) + topk_indices, topk_weights = self.gate(hidden_states) + batch_size = hidden_states.shape[0] + router_logits = torch.zeros( + batch_size, self.num_experts, + device=hidden_states.device, + dtype=hidden_states.dtype + ) + + for i in range(batch_size): + router_logits[i, topk_indices[i]] = topk_weights[i] + routed_output = self.experts( + hidden_states=hidden_states, + router_logits=router_logits + ) + + if self.tp_size > 1: + routed_output = self.experts.maybe_all_reduce_tensor_model_parallel( + routed_output + ) + + shared_output = self.shared_experts(residuals.view(-1, hidden_dim)) + final_output = routed_output + shared_output + + return final_output.view(orig_shape) + + +class Glm4MoeAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + head_dim: Optional[int] = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + add_qk_norm: bool = False, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim ** -0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + self.add_qk_norm = add_qk_norm + + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + if self.add_qk_norm: + self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + if self.add_qk_norm: + q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim) + q_by_head = self.q_norm(q_by_head) + q = q_by_head.view(q.shape) + + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim) + k_by_head = self.k_norm(k_by_head) + k = k_by_head.view(k.shape) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Glm4MoeDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + + self.self_attn = Glm4MoeAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, 'attention_bias', False), + head_dim=getattr(config, 'head_dim', None), + add_qk_norm=getattr(config, 'add_qk_norm', False), # Add this + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + layer_idx = extract_layer_index(prefix) + if layer_idx >= getattr(config, "first_k_dense_replace", 1): + self.mlp = Glm4MoeSparseMoeBlock( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp" + ) + else: + self.mlp = Glm4MoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp" + ) + + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = 
RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile +class Glm4MoeModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens" + ) + + num_layers = config.num_hidden_layers + if hasattr(config, 'num_nextn_predict_layers'): + num_layers = config.num_hidden_layers - config.num_nextn_predict_layers + + self.start_layer, self.end_layer, self.layers = make_layers( + num_layers, + lambda prefix: Glm4MoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix + ), + prefix=f"{prefix}.layers", + ) + + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Skip loading extra parameters for GPTQ/modelopt models. 
+ ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale", + ".v_scale", "_v_scale", ".weight_scale", + "_weight_scale", ".input_scale", "_input_scale") + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts + ) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + if "gate.weight" in name and "experts" not in name: + if is_pp_missing_parameter(name, self): + continue + if name in params_dict: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if "mlp.experts" in name and "shared_experts" not in name: + continue + + name = name.replace(weight_name, param_name) + + # Skip loading extra parameters for GPTQ/modelopt models. + if name.endswith(ignore_suffixes) and name not in params_dict: + continue + + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + if name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) + break + else: + if "mlp.experts" in name and "shared_experts" not in name: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + if name.endswith(ignore_suffixes) and name not in params_dict: + continue + if name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, name, + shard_id=shard_id, expert_id=expert_id) + loaded_params.add(name) + break + else: + # Handle other parameters + if name.endswith(ignore_suffixes) and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + # Remapping for FP8 kv-scale + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace(".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + logger.warning_once( + "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). 
kv-scale is not loaded.", + name, remapped_kv_scale_name, + ) + continue + else: + name = remapped_kv_scale_name + + if name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + + +class Glm4MoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + + self.model = Glm4MoeModel( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model") + ) + + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config + ) + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 27d476929855..c8d17d23d239 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -65,6 +65,7 @@ "Gemma3nForConditionalGeneration": ("gemma3n", "Gemma3nForConditionalGeneration"), # noqa: E501 "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), + "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py index c2728f0ed64c..db6050fac57c 100644 --- a/vllm/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -73,7 +73,7 @@ def __init__( IMAGE_ATOM_ID = -300 IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] -AutoConfig.register("aimv2", AIMv2Config) +# AutoConfig.register("aimv2", AIMv2Config) # ---------------------------------------------------------------------- From 818db5926c023d455fe89cf2811a7e695dae47d4 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 13:45:43 +0800 Subject: [PATCH 02/44] ovis aimv2 model type need changed, mark 
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- tests/models/registry.py | 2 +- vllm/transformers_utils/configs/ovis.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 7f5b2b96cf87..f3340acf040b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -348,7 +348,7 @@ def check_available_online( trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"), # noqa: E501 - "Glm4MoeForCausalLM": _HfExamplesInfo("/model/GLM-4-MoE-100B-A10B", min_transformers_version="4.54"), # noqa: E501 + "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4-MoE-100B-A10B", min_transformers_version="4.54"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py index db6050fac57c..c2728f0ed64c 100644 --- a/vllm/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -73,7 +73,7 @@ def __init__( IMAGE_ATOM_ID = -300 IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] -# AutoConfig.register("aimv2", AIMv2Config) +AutoConfig.register("aimv2", AIMv2Config) # ---------------------------------------------------------------------- From c6b8eb652c31cec3913d775efe04a73d5752505a Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 14:01:25 +0800 Subject: [PATCH 03/44] format Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 394 +++++++++++++------------ 1 file changed, 201 insertions(+), 193 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index dd9f738aeebd..832fff897fce 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -27,8 +27,8 @@ from typing import Any, Optional, Union import torch -from torch import nn import torch.nn.functional as F +from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention @@ -63,13 +63,13 @@ class Glm4MoeMLP(nn.Module): def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -97,70 +97,81 @@ def forward(self, x): class Glm4MoeTopkRouter(nn.Module): - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.config = config - self.top_k = config.num_experts_per_tok - self.n_routed_experts = config.n_routed_experts - self.routed_scaling_factor = config.routed_scaling_factor - self.n_group = config.n_group - self.topk_group = config.topk_group - self.norm_topk_prob = config.norm_topk_prob - - self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size))) - self.register_buffer("e_score_correction_bias",torch.zeros((self.n_routed_experts))) - - @torch.no_grad() - def 
get_topk_indices(self, scores): - scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) - - group_scores = ( - scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) - .topk(2, dim=-1)[0] - .sum(dim=-1) - ) - - group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] - group_mask = torch.zeros_like(group_scores) - group_mask.scatter_(1, group_idx, 1) - - score_mask = ( - group_mask.unsqueeze(-1) - .expand(-1, self.n_group, self.n_routed_experts // self.n_group) - .reshape(-1, self.n_routed_experts) - ) - - scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) - topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] - return topk_indices - - def forward(self, hidden_states): - hidden_states = hidden_states.view(-1, self.config.hidden_size) - router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32)) - scores = router_logits.sigmoid() - - topk_indices = self.get_topk_indices(scores) - topk_weights = scores.gather(1, topk_indices) - - if self.norm_topk_prob: - denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 - topk_weights /= denominator - - topk_weights = topk_weights * self.routed_scaling_factor - return topk_indices, topk_weights + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + + self.weight = nn.Parameter( + torch.empty((self.n_routed_experts, config.hidden_size))) + self.register_buffer( + "e_score_correction_bias", + torch.zeros((self.n_routed_experts), dtype=torch.float32)) + + @torch.no_grad() + def get_topk_indices(self, scores): + scores_for_choice = scores.view( + -1, + self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) + + group_scores = (scores_for_choice.view( + -1, self.n_group, + self.n_routed_experts // self.n_group).topk(2, + dim=-1)[0].sum(dim=-1)) + + group_idx = torch.topk(group_scores, + k=self.topk_group, + dim=-1, + sorted=False)[1] + group_mask = torch.zeros_like(group_scores) + group_mask.scatter_(1, group_idx, 1) + + score_mask = (group_mask.unsqueeze(-1).expand( + -1, self.n_group, self.n_routed_experts // self.n_group).reshape( + -1, self.n_routed_experts)) + + scores_for_choice = scores_for_choice.masked_fill( + ~score_mask.bool(), 0.0) + topk_indices = torch.topk(scores_for_choice, + k=self.top_k, + dim=-1, + sorted=False)[1] + return topk_indices + + def forward(self, hidden_states): + hidden_states = hidden_states.view(-1, self.config.hidden_size) + router_logits = F.linear(hidden_states.type(torch.float32), + self.weight.type(torch.float32)) + scores = router_logits.sigmoid() + + topk_indices = self.get_topk_indices(scores) + topk_weights = scores.gather(1, topk_indices) + + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights /= denominator + + topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights class Glm4MoeSparseMoeBlock(nn.Module): + def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", + self, + config: 
PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() @@ -174,31 +185,27 @@ def __init__( f"Tensor parallel size {self.tp_size} is greater than " f"the number of experts {self.num_experts}.") - self.gate = Glm4MoeTopkRouter( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.gate" - ) + self.gate = Glm4MoeTopkRouter(config=config, + quant_config=quant_config, + prefix=f"{prefix}.gate") - self.experts = FusedMoE( - num_experts=self.num_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=self.norm_topk_prob, - quant_config=quant_config, - prefix=f"{prefix}.experts" - ) + self.experts = FusedMoE(num_experts=self.num_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts") self.shared_experts = Glm4MoeMLP( hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size * config.n_shared_experts, + intermediate_size=config.moe_intermediate_size * + config.n_shared_experts, hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=False, - prefix=f"{prefix}.shared_experts" - ) + prefix=f"{prefix}.shared_experts") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -208,23 +215,19 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(-1, hidden_dim) topk_indices, topk_weights = self.gate(hidden_states) batch_size = hidden_states.shape[0] - router_logits = torch.zeros( - batch_size, self.num_experts, - device=hidden_states.device, - dtype=hidden_states.dtype - ) + router_logits = torch.zeros(batch_size, + self.num_experts, + device=hidden_states.device, + dtype=hidden_states.dtype) for i in range(batch_size): router_logits[i, topk_indices[i]] = topk_weights[i] - routed_output = self.experts( - hidden_states=hidden_states, - router_logits=router_logits - ) + routed_output = self.experts(hidden_states=hidden_states, + router_logits=router_logits) if self.tp_size > 1: routed_output = self.experts.maybe_all_reduce_tensor_model_parallel( - routed_output - ) + routed_output) shared_output = self.shared_experts(residuals.view(-1, hidden_dim)) final_output = routed_output + shared_output @@ -235,20 +238,20 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Glm4MoeAttention(nn.Module): def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[dict[str, Any]] = None, - max_position_embeddings: int = 8192, - head_dim: Optional[int] = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - add_qk_norm: bool = False, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + head_dim: Optional[int] = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + add_qk_norm: bool = False, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -269,7 
+272,7 @@ def __init__( self.head_dim = head_dim or (hidden_size // self.total_num_heads) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.add_qk_norm = add_qk_norm @@ -308,19 +311,21 @@ def __init__( self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.add_qk_norm: - q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim) + q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, + self.head_dim) q_by_head = self.q_norm(q_by_head) q = q_by_head.view(q.shape) - k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim) + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, + self.head_dim) k_by_head = self.k_norm(k_by_head) k = k_by_head.view(k.shape) @@ -333,17 +338,18 @@ def forward( class Glm4MoeDecoderLayer(nn.Module): def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) self.self_attn = Glm4MoeAttention( hidden_size=self.hidden_size, @@ -355,7 +361,7 @@ def __init__( rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, 'attention_bias', False), head_dim=getattr(config, 'head_dim', None), - add_qk_norm=getattr(config, 'add_qk_norm', False), # Add this + add_qk_norm=getattr(config, 'add_qk_norm', False), cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -363,41 +369,41 @@ def __init__( layer_idx = extract_layer_index(prefix) if layer_idx >= getattr(config, "first_k_dense_replace", 1): - self.mlp = Glm4MoeSparseMoeBlock( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp" - ) + self.mlp = Glm4MoeSparseMoeBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") else: - self.mlp = Glm4MoeMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp" - ) + self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: 
Optional[torch.Tensor], + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, ) - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -419,8 +425,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, - prefix=f"{prefix}.embed_tokens" - ) + prefix=f"{prefix}.embed_tokens") num_layers = config.num_hidden_layers if hasattr(config, 'num_nextn_predict_layers'): @@ -428,31 +433,27 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( num_layers, - lambda prefix: Glm4MoeDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix - ), + lambda prefix: Glm4MoeDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - ) + ["hidden_states", "residual"], config.hidden_size)) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: if inputs_embeds is not None: @@ -478,7 +479,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -499,8 +501,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", - num_experts=self.config.n_routed_experts - ) + num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() @@ -511,7 +512,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: continue if name in params_dict: param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) continue @@ -555,31 +557,39 @@ def load_weights(self, weights: Iterable[tuple[str, 
torch.Tensor]]) -> set[str]: name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): continue - if name.endswith(ignore_suffixes) and name not in params_dict: + if name.endswith( + ignore_suffixes) and name not in params_dict: continue if name not in params_dict: continue param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, name, - shard_id=shard_id, expert_id=expert_id) + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) loaded_params.add(name) break else: # Handle other parameters - if name.endswith(ignore_suffixes) and name not in params_dict: + if name.endswith( + ignore_suffixes) and name not in params_dict: continue + # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue # Remapping for FP8 kv-scale if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace(".kv_scale", ".attn.kv_scale") + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: logger.warning_once( "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.", - name, remapped_kv_scale_name, + name, + remapped_kv_scale_name, ) continue else: @@ -589,7 +599,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) @@ -618,47 +629,44 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.quant_config = quant_config - self.model = Glm4MoeModel( - vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model") - ) + self.model = Glm4MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - quant_config=quant_config - ) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) + self.model.make_empty_intermediate_tensors) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits - def load_weights(self, weights: 
Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) From f406a09a006e3861b082c1368ff81efcd78fb57a Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 14:28:10 +0800 Subject: [PATCH 04/44] format Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 146 +++++++----------------- vllm/transformers_utils/configs/ovis.py | 2 +- 2 files changed, 44 insertions(+), 104 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 832fff897fce..cbeee1e22f9d 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -27,7 +27,6 @@ from typing import Any, Optional, Union import torch -import torch.nn.functional as F from torch import nn from transformers import PretrainedConfig @@ -41,6 +40,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -95,76 +95,6 @@ def forward(self, x): return x -class Glm4MoeTopkRouter(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.config = config - self.top_k = config.num_experts_per_tok - self.n_routed_experts = config.n_routed_experts - self.routed_scaling_factor = config.routed_scaling_factor - self.n_group = config.n_group - self.topk_group = config.topk_group - self.norm_topk_prob = config.norm_topk_prob - - self.weight = nn.Parameter( - torch.empty((self.n_routed_experts, config.hidden_size))) - self.register_buffer( - "e_score_correction_bias", - torch.zeros((self.n_routed_experts), dtype=torch.float32)) - - @torch.no_grad() - def get_topk_indices(self, scores): - scores_for_choice = scores.view( - -1, - self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) - - group_scores = (scores_for_choice.view( - -1, self.n_group, - self.n_routed_experts // self.n_group).topk(2, - dim=-1)[0].sum(dim=-1)) - - group_idx = torch.topk(group_scores, - k=self.topk_group, - dim=-1, - sorted=False)[1] - group_mask = torch.zeros_like(group_scores) - group_mask.scatter_(1, group_idx, 1) - - score_mask = (group_mask.unsqueeze(-1).expand( - -1, self.n_group, self.n_routed_experts // self.n_group).reshape( - -1, self.n_routed_experts)) - - scores_for_choice = scores_for_choice.masked_fill( - ~score_mask.bool(), 0.0) - topk_indices = torch.topk(scores_for_choice, - k=self.top_k, - dim=-1, - sorted=False)[1] - return topk_indices - - def forward(self, hidden_states): - hidden_states = hidden_states.view(-1, self.config.hidden_size) - router_logits = F.linear(hidden_states.type(torch.float32), - self.weight.type(torch.float32)) - scores = router_logits.sigmoid() - - topk_indices = self.get_topk_indices(scores) - topk_weights = scores.gather(1, topk_indices) - - if self.norm_topk_prob: - denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 - topk_weights /= denominator - - topk_weights = topk_weights * self.routed_scaling_factor - return topk_indices, topk_weights - - class 
Glm4MoeSparseMoeBlock(nn.Module): def __init__( @@ -178,6 +108,7 @@ def __init__( self.config = config self.num_experts = config.n_routed_experts self.top_k = config.num_experts_per_tok + self.routed_scaling_factor = config.routed_scaling_factor self.norm_topk_prob = config.norm_topk_prob if self.tp_size > self.num_experts: @@ -185,18 +116,29 @@ def __init__( f"Tensor parallel size {self.tp_size} is greater than " f"the number of experts {self.num_experts}.") - self.gate = Glm4MoeTopkRouter(config=config, - quant_config=quant_config, - prefix=f"{prefix}.gate") + self.gate = ReplicatedLinear(config.hidden_size, + config.n_routed_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") - self.experts = FusedMoE(num_experts=self.num_experts, - top_k=self.top_k, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=self.norm_topk_prob, - quant_config=quant_config, - prefix=f"{prefix}.experts") + self.gate.e_score_correction_bias = nn.Parameter( + torch.zeros(config.n_routed_experts, dtype=torch.float32)) + + self.experts = FusedMoE( + num_experts=self.num_experts, + top_k=self.top_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=self.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=getattr(config, 'scoring_func', 'sigmoid'), + e_score_correction_bias=self.gate.e_score_correction_bias) self.shared_experts = Glm4MoeMLP( hidden_size=config.hidden_size, @@ -208,31 +150,25 @@ def __init__( prefix=f"{prefix}.shared_experts") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - - residuals = hidden_states - orig_shape = hidden_states.shape - hidden_dim = hidden_states.shape[-1] + num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - topk_indices, topk_weights = self.gate(hidden_states) - batch_size = hidden_states.shape[0] - router_logits = torch.zeros(batch_size, - self.num_experts, - device=hidden_states.device, - dtype=hidden_states.dtype) - - for i in range(batch_size): - router_logits[i, topk_indices[i]] = topk_weights[i] - routed_output = self.experts(hidden_states=hidden_states, - router_logits=router_logits) + + shared_output = self.shared_experts(hidden_states) + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + + routed_output = self.experts( + hidden_states=hidden_states, + router_logits=router_logits) * self.routed_scaling_factor if self.tp_size > 1: routed_output = self.experts.maybe_all_reduce_tensor_model_parallel( routed_output) - shared_output = self.shared_experts(residuals.view(-1, hidden_dim)) final_output = routed_output + shared_output - return final_output.view(orig_shape) + return final_output.view(num_tokens, hidden_dim) class Glm4MoeAttention(nn.Module): @@ -317,7 +253,7 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - + # Add qk-norm if self.add_qk_norm: q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim) @@ -367,6 +303,7 @@ def __init__( prefix=f"{prefix}.self_attn", ) + # `mlp_only_layers` in the config. 
layer_idx = extract_layer_index(prefix) if layer_idx >= getattr(config, "first_k_dense_replace", 1): self.mlp = Glm4MoeSparseMoeBlock(config=config, @@ -390,6 +327,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -402,6 +340,7 @@ def forward( hidden_states=hidden_states, ) + # Fully Connected hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) hidden_states = self.mlp(hidden_states) @@ -555,8 +494,10 @@ def load_weights(self, weights: Iterable[tuple[str, if weight_name not in name: continue name = name.replace(weight_name, param_name) + # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + # Skip loading extra parameters for GPTQ/modelopt models. if name.endswith( ignore_suffixes) and name not in params_dict: continue @@ -573,15 +514,14 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params.add(name) break else: - # Handle other parameters + # Skip loading extra parameters for GPTQ/modelopt models. if name.endswith( ignore_suffixes) and name not in params_dict: continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - - # Remapping for FP8 kv-scale + # Remapping the name of FP8 kv-scale. if name.endswith("kv_scale"): remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py index c2728f0ed64c..db6050fac57c 100644 --- a/vllm/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -73,7 +73,7 @@ def __init__( IMAGE_ATOM_ID = -300 IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] -AutoConfig.register("aimv2", AIMv2Config) +# AutoConfig.register("aimv2", AIMv2Config) # ---------------------------------------------------------------------- From efcff2bad35e65845f7a50a3d29d6ff6c1645c0f Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 14:43:54 +0800 Subject: [PATCH 05/44] use ds loading(not work) --- vllm/model_executor/models/glm4_moe.py | 139 ++++++++++++------------- 1 file changed, 67 insertions(+), 72 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index cbeee1e22f9d..3b6e6ec301b0 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -22,8 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GLM-4-MOE model compatible with HuggingFace weights.""" - -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable from typing import Any, Optional, Union import torch @@ -47,7 +47,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -429,11 +430,6 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] - # Skip loading extra parameters for GPTQ/modelopt models. - ignore_suffixes = (".bias", "_bias", ".k_scale", "_k_scale", - ".v_scale", "_v_scale", ".weight_scale", - "_weight_scale", ".input_scale", "_input_scale") - # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) expert_params_mapping = FusedMoE.make_expert_params_mapping( @@ -444,18 +440,10 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "gate.weight" in name and "experts" not in name: - if is_pp_missing_parameter(name, self): - continue - if name in params_dict: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - continue + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). @@ -467,82 +455,77 @@ def load_weights(self, weights: Iterable[tuple[str, # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name and "shared_experts" not in name: + if (("mlp.experts." in name) and name not in params_dict): continue - name = name.replace(weight_name, param_name) - - # Skip loading extra parameters for GPTQ/modelopt models. - if name.endswith(ignore_suffixes) and name not in params_dict: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: continue - # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - if name not in params_dict: - continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) - loaded_params.add(name) break else: - if "mlp.experts" in name and "shared_experts" not in name: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - # Skip loading extra parameters for GPTQ/modelopt models. 
- if name.endswith( - ignore_suffixes) and name not in params_dict: - continue - if name not in params_dict: - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) - loaded_params.add(name) + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name_mapped, self): + continue + + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. + weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped break else: - # Skip loading extra parameters for GPTQ/modelopt models. - if name.endswith( - ignore_suffixes) and name not in params_dict: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: continue + # Remapping the name of FP8 kv-scale. - if name.endswith("kv_scale"): - remapped_kv_scale_name = name.replace( - ".kv_scale", ".attn.kv_scale") - if remapped_kv_scale_name not in params_dict: - logger.warning_once( - "Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). 
kv-scale is not loaded.", - name, - remapped_kv_scale_name, - ) - continue - else: - name = remapped_kv_scale_name - - if name not in params_dict: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - loaded_params.add(name) + loaded_params.add(name) return loaded_params @@ -610,3 +593,15 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + +def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, + weight_name: str) -> Optional[int]: + if hasattr(config, + "num_nextn_predict_layers") and (config.num_nextn_predict_layers + > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_nextn_predict_layers): + if weight_name.startswith(f"model.layers.{layer_idx+i}."): + return layer_idx + i + return None From e6ad57675773ce146983ed43cfd6a2811c6e7e08 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 18:42:54 +0800 Subject: [PATCH 06/44] update Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 126 ++++++++++++------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 3b6e6ec301b0..d6cf9c74133a 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -33,7 +33,9 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -53,8 +55,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP -from .utils import (AutoWeightsLoader, extract_layer_index, - is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -96,7 +97,7 @@ def forward(self, x): return x -class Glm4MoeSparseMoeBlock(nn.Module): +class Glm4MoeE(nn.Module): def __init__( self, @@ -108,9 +109,13 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_experts = config.n_routed_experts - self.top_k = config.num_experts_per_tok self.routed_scaling_factor = config.routed_scaling_factor self.norm_topk_prob = config.norm_topk_prob + self.n_shared_experts = config.n_shared_experts + + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" + "Only silu is supported for now.") if self.tp_size > self.num_experts: raise ValueError( @@ -123,53 +128,52 @@ def __init__( quant_config=None, prefix=f"{prefix}.gate") + # noaux_tc is not wrote in config now self.gate.e_score_correction_bias = nn.Parameter( torch.zeros(config.n_routed_experts, dtype=torch.float32)) self.experts = FusedMoE( - num_experts=self.num_experts, - top_k=self.top_k, + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, reduce_results=False, - renormalize=self.norm_topk_prob, + renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, num_expert_group=config.n_group, topk_group=config.topk_group, prefix=f"{prefix}.experts", - scoring_func=getattr(config, 'scoring_func', 'sigmoid'), + scoring_func=config.scoring_func, e_score_correction_bias=self.gate.e_score_correction_bias) - self.shared_experts = Glm4MoeMLP( - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size * - config.n_shared_experts, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=False, - prefix=f"{prefix}.shared_experts") + if config.n_shared_experts is not None: + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + self.shared_experts = Glm4MoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_experts", + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - - shared_output = self.shared_experts(hidden_states) - - # router_logits: (num_tokens, n_experts) + if self.n_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) router_logits, _ = self.gate(hidden_states) - - routed_output = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits) * self.routed_scaling_factor - + if shared_output is not None: + final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: - routed_output = self.experts.maybe_all_reduce_tensor_model_parallel( - routed_output) - - final_output = routed_output + shared_output - - return final_output.view(num_tokens, hidden_dim) + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + return final_hidden_states.view(num_tokens, hidden_dim) class Glm4MoeAttention(nn.Module): @@ -181,9 +185,9 @@ def __init__( num_kv_heads: int, rope_theta: float = 10000, rope_scaling: Optional[dict[str, Any]] = None, - max_position_embeddings: int = 8192, + max_position_embeddings: int = 131072, head_dim: Optional[int] = None, - rms_norm_eps: float = 1e-06, + rms_norm_eps: float = 1e-05, qkv_bias: bool = False, add_qk_norm: bool = False, cache_config: Optional[CacheConfig] = None, @@ -254,7 +258,6 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - # Add qk-norm if self.add_qk_norm: q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim) @@ -286,7 +289,9 @@ def __init__( rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) + 131072) + layer_idx = int(prefix.split(sep='.')[-1]) + 
self.layer_idx = layer_idx self.self_attn = Glm4MoeAttention( hidden_size=self.hidden_size, @@ -296,20 +301,15 @@ def __init__( rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, 'attention_bias', False), - head_dim=getattr(config, 'head_dim', None), - add_qk_norm=getattr(config, 'add_qk_norm', False), cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", ) - # `mlp_only_layers` in the config. - layer_idx = extract_layer_index(prefix) - if layer_idx >= getattr(config, "first_k_dense_replace", 1): - self.mlp = Glm4MoeSparseMoeBlock(config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") + if layer_idx >= config.first_k_dense_replace: + self.mlp = Glm4MoeE(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") else: self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, @@ -328,20 +328,14 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: - # Self Attention if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: hidden_states, residual = self.input_layernorm( hidden_states, residual) - - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected + hidden_states = self.self_attn(positions=positions, + hidden_states=hidden_states) hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) hidden_states = self.mlp(hidden_states) @@ -357,30 +351,31 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size self.config = config - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - prefix=f"{prefix}.embed_tokens") + self.vocab_size = config.vocab_size - num_layers = config.num_hidden_layers - if hasattr(config, 'num_nextn_predict_layers'): - num_layers = config.num_hidden_layers - config.num_nextn_predict_layers + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( - num_layers, + config.num_hidden_layers, lambda prefix: Glm4MoeDecoderLayer(config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix), - prefix=f"{prefix}.layers", - ) + prefix=f"{prefix}.layers") - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -461,7 +456,6 @@ def load_weights(self, weights: Iterable[tuple[str, # Skip loading extra bias for GPTQ models. 
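# Illustrative sketch, not part of the diff: how `first_k_dense_replace` above
# selects the MLP type per decoder layer; the threshold value used here is a
# hypothetical example, not read from the checkpoint config.
def mlp_kind(layer_idx: int, first_k_dense_replace: int) -> str:
    # layers below the threshold stay dense, the rest use the sparse MoE block
    return "dense" if layer_idx < first_k_dense_replace else "moe"

assert [mlp_kind(i, first_k_dense_replace=1) for i in range(4)] == [
    "dense", "moe", "moe", "moe"]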
if name.endswith(".bias") and name not in params_dict: continue - if is_pp_missing_parameter(name, self): continue From d49c5bc00f0c584198f43a0ca8763f6770a58ff2 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 18:56:05 +0800 Subject: [PATCH 07/44] update 1 Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index d6cf9c74133a..bbd62f573291 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -128,9 +128,9 @@ def __init__( quant_config=None, prefix=f"{prefix}.gate") - # noaux_tc is not wrote in config now - self.gate.e_score_correction_bias = nn.Parameter( - torch.zeros(config.n_routed_experts, dtype=torch.float32)) + # noaux_tc is not set in config now + self.gate.e_score_correction_bias = (nn.Parameter( + torch.empty(config.n_routed_experts))) self.experts = FusedMoE( num_experts=config.n_routed_experts, @@ -144,7 +144,7 @@ def __init__( num_expert_group=config.n_group, topk_group=config.topk_group, prefix=f"{prefix}.experts", - scoring_func=config.scoring_func, + scoring_func="sigmoid", e_score_correction_bias=self.gate.e_score_correction_bias) if config.n_shared_experts is not None: @@ -306,7 +306,8 @@ def __init__( prefix=f"{prefix}.self_attn", ) - if layer_idx >= config.first_k_dense_replace: + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace): self.mlp = Glm4MoeE(config=config, quant_config=quant_config, prefix=f"{prefix}.mlp") From c8ad31e0f8d694e79b56013d87f073318c2bf26c Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 21:27:08 +0800 Subject: [PATCH 08/44] test --- vllm/model_executor/models/glm4_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index bbd62f573291..105abba7c8cc 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -300,6 +300,7 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, + head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, cache_config=cache_config, quant_config=quant_config, From 2440756a46c9f94ea7b54639e3ccbaa49f724ebd Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 10 Jul 2025 23:07:10 +0800 Subject: [PATCH 09/44] Update glm4_moe.py --- vllm/model_executor/models/glm4_moe.py | 64 ++++++++------------------ 1 file changed, 19 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 105abba7c8cc..418bfb290473 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -97,7 +97,7 @@ def forward(self, x): return x -class Glm4MoeE(nn.Module): +class Glm4MoE(nn.Module): def __init__( self, @@ -107,28 +107,20 @@ def __init__( ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() - self.config = config - self.num_experts = config.n_routed_experts self.routed_scaling_factor = config.routed_scaling_factor - self.norm_topk_prob = config.norm_topk_prob self.n_shared_experts = config.n_shared_experts if config.hidden_act != "silu": raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" "Only silu is supported for now.") - if self.tp_size > self.num_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.num_experts}.") - self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, quant_config=None, prefix=f"{prefix}.gate") - # noaux_tc is not set in config now + # noaux_tc is not set in transformers new config now self.gate.e_score_correction_bias = (nn.Parameter( torch.empty(config.n_routed_experts))) @@ -236,16 +228,19 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, + partial_rotary_factor=0.5, base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) if self.add_qk_norm: self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -259,15 +254,10 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.add_qk_norm: - q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, - self.head_dim) - q_by_head = self.q_norm(q_by_head) - q = q_by_head.view(q.shape) - - k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, - self.head_dim) - k_by_head = self.k_norm(k_by_head) - k = k_by_head.view(k.shape) + q = self.q_norm(q.reshape(-1, self.num_heads, + self.head_dim)).reshape(q.shape) + k = self.k_norm(k.reshape(-1, self.num_kv_heads, + self.head_dim)).reshape(k.shape) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) @@ -309,9 +299,9 @@ def __init__( if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace): - self.mlp = Glm4MoeE(config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") + self.mlp = Glm4MoE(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") else: self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, @@ -438,10 +428,6 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue # skip spec decode layers for main model - for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). 
if weight_name not in name: @@ -589,15 +575,3 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) - - -def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, - weight_name: str) -> Optional[int]: - if hasattr(config, - "num_nextn_predict_layers") and (config.num_nextn_predict_layers - > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_nextn_predict_layers): - if weight_name.startswith(f"model.layers.{layer_idx+i}."): - return layer_idx + i - return None From 5e38b14a135a4aedde57227f614eb668106b9619 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 11 Jul 2025 00:03:40 +0800 Subject: [PATCH 10/44] update --- vllm/model_executor/models/glm4_moe.py | 49 ++++++++++++++++++++------ 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 418bfb290473..52b946865c83 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -32,7 +32,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -104,6 +104,7 @@ def __init__( config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() @@ -124,6 +125,22 @@ def __init__( self.gate.e_score_correction_bias = (nn.Parameter( torch.empty(config.n_routed_experts))) + # Load balancing settings. 
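# Illustrative sketch, not part of the diff: the arithmetic behind the load
# balancing fields set up below. The expert counts and EP group size here are
# hypothetical examples; the patch reads the real values from the parallel
# config and the expert-parallel group.
def physical_expert_range(n_routed: int, n_redundant: int,
                          ep_size: int, ep_rank: int) -> tuple[int, int]:
    n_physical = n_routed + n_redundant   # logical experts + redundant copies
    n_local = n_physical // ep_size       # physical experts hosted per rank
    start = ep_rank * n_local
    return start, start + n_local

# e.g. 128 routed + 32 redundant experts across an 8-way EP group -> 20 per rank
assert physical_expert_range(128, 32, 8, 0) == (0, 20)
assert physical_expert_range(128, 32, 8, 7) == (140, 160)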
+ vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + self.enable_eplb = enable_eplb + + self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + self.experts = FusedMoE( num_experts=config.n_routed_experts, top_k=config.num_experts_per_tok, @@ -137,7 +154,9 @@ def __init__( topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func="sigmoid", - e_score_correction_bias=self.gate.e_score_correction_bias) + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) if config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * @@ -147,7 +166,8 @@ def __init__( intermediate_size=intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=False, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), prefix=f"{prefix}.shared_experts", ) @@ -228,7 +248,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - partial_rotary_factor=0.5, base=rope_theta, rope_scaling=rope_scaling, ) @@ -273,6 +292,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -299,9 +319,12 @@ def __init__( if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace): - self.mlp = Glm4MoE(config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") + self.mlp = Glm4MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, + ) else: self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, @@ -343,6 +366,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + enable_eplb = vllm_config.parallel_config.enable_eplb self.config = config self.vocab_size = config.vocab_size @@ -358,10 +382,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: Glm4MoeDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + lambda prefix: Glm4MoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + enable_eplb=enable_eplb, + ), prefix=f"{prefix}.layers") if get_pp_group().is_last_rank: From c227471e74bcb4c4ebd1360776966ea083abd590 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 11 Jul 2025 00:06:09 +0800 Subject: [PATCH 11/44] Update glm4_moe.py --- vllm/model_executor/models/glm4_moe.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 52b946865c83..ca6250b7b767 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ 
b/vllm/model_executor/models/glm4_moe.py @@ -33,7 +33,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import (get_pp_group, +from vllm.distributed import (get_ep_group, get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.logger import init_logger @@ -109,7 +109,12 @@ def __init__( super().__init__() self.tp_size = get_tensor_model_parallel_world_size() self.routed_scaling_factor = config.routed_scaling_factor - self.n_shared_experts = config.n_shared_experts + + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts: int = config.n_routed_experts + self.n_shared_experts: int = config.n_shared_experts if config.hidden_act != "silu": raise ValueError(f"Unsupported activation: {config.hidden_act}. " From 9f3ab705a6cf0e4fb931956986f1833729d5c8fb Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 11 Jul 2025 10:52:01 +0800 Subject: [PATCH 12/44] 1 --- vllm/model_executor/models/glm4_moe.py | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index ca6250b7b767..b615cd434eef 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -305,6 +305,8 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. layer_idx = int(prefix.split(sep='.')[-1]) self.layer_idx = layer_idx @@ -341,6 +343,7 @@ def __init__( eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.routed_scaling_factor = config.routed_scaling_factor def forward( self, @@ -580,6 +583,45 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + self.expert_weights = [] + + # Set MoE hyperparameters + self.num_moe_layers = (config.num_hidden_layers - + config.first_k_dense_replace) + self.num_expert_groups = config.n_group + + self.moe_layers: list[FusedMoE] = [] + for layer in self.model.layers: + assert isinstance(layer, Glm4MoeDecoderLayer) + if isinstance(layer.mlp, Glm4MoE): + self.moe_layers.append(layer.mlp.experts) + + # Pick last one layer since the first ones may be dense layers. + example_moe = typing.cast( + Glm4MoE, self.model.layers[config.num_hidden_layers - 1].mlp) + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 8feace60a9db2620fbe869e5d6ce7f875201eab6 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 11 Jul 2025 13:13:37 +0800 Subject: [PATCH 13/44] use ds imp --- vllm/model_executor/models/glm4_moe.py | 227 +++++++++++++------------ 1 file changed, 117 insertions(+), 110 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index b615cd434eef..28eddbf8fa0b 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -34,8 +34,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import (get_ep_group, get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) + get_tensor_model_parallel_world_size) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -55,7 +54,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, +from .utils import (PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -188,8 +187,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if shared_output is not None: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) + final_hidden_states = ( + self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) return final_hidden_states.view(num_tokens, hidden_dim) @@ -441,6 +441,117 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + +class Glm4MoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Glm4MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + self.expert_weights = [] + + # Set MoE hyperparameters + self.num_moe_layers = (config.num_hidden_layers - + config.first_k_dense_replace) + self.num_expert_groups = config.n_group + + self.moe_layers: list[FusedMoE] = [] + for layer in self.model.layers: + assert isinstance(layer, Glm4MoeDecoderLayer) + if 
isinstance(layer.mlp, Glm4MoE): + self.moe_layers.append(layer.mlp.experts) + + # Pick last one layer since the first ones may be dense layers. + example_moe = typing.cast( + Glm4MoE, self.model.layers[config.num_hidden_layers - 1].mlp) + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ @@ -544,108 +655,4 @@ def load_weights(self, weights: Iterable[tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) - return loaded_params - - -class Glm4MoeForCausalLM(nn.Module, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - - self.model = Glm4MoeModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - self.expert_weights = [] - - # Set MoE hyperparameters - self.num_moe_layers = (config.num_hidden_layers - - config.first_k_dense_replace) - self.num_expert_groups = 
config.n_group - - self.moe_layers: list[FusedMoE] = [] - for layer in self.model.layers: - assert isinstance(layer, Glm4MoeDecoderLayer) - if isinstance(layer.mlp, Glm4MoE): - self.moe_layers.append(layer.mlp.experts) - - # Pick last one layer since the first ones may be dense layers. - example_moe = typing.cast( - Glm4MoE, self.model.layers[config.num_hidden_layers - 1].mlp) - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. - self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loaded_params \ No newline at end of file From 12666d3da2101a2ed127e7ebd1ca32d7fb07fb9b Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 11 Jul 2025 22:27:45 +0800 Subject: [PATCH 14/44] update and merge --- vllm/model_executor/models/glm4_moe.py | 204 ++++++++++++------------ vllm/transformers_utils/configs/ovis.py | 2 +- 2 files changed, 106 insertions(+), 100 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 28eddbf8fa0b..0bb465267c28 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -54,7 +54,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP -from .utils import (PPMissingLayer, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -178,6 +178,7 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) + if self.n_shared_experts is not None: shared_output = self.shared_experts(hidden_states) router_logits, _ = self.gate(hidden_states) @@ -441,103 +442,6 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - -class Glm4MoeForCausalLM(nn.Module, SupportsPP): 
- packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = Glm4MoeModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - else: - self.lm_head = PPMissingLayer() - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - self.expert_weights = [] - - # Set MoE hyperparameters - self.num_moe_layers = (config.num_hidden_layers - - config.first_k_dense_replace) - self.num_expert_groups = config.n_group - - self.moe_layers: list[FusedMoE] = [] - for layer in self.model.layers: - assert isinstance(layer, Glm4MoeDecoderLayer) - if isinstance(layer.mlp, Glm4MoE): - self.moe_layers.append(layer.mlp.experts) - - # Pick last one layer since the first ones may be dense layers. - example_moe = typing.cast( - Glm4MoE, self.model.layers[config.num_hidden_layers - 1].mlp) - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def set_eplb_state( - self, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - for layer_idx, layer in enumerate(self.moe_layers): - # Register the expert weights. 
- self.expert_weights.append(layer.get_expert_weights()) - layer.set_eplb_state( - moe_layer_idx=layer_idx, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - def make_empty_intermediate_tensors( self, batch_size: int, dtype: torch.dtype, device: torch.device) -> IntermediateTensors: @@ -655,4 +559,106 @@ def load_weights(self, weights: Iterable[tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) - return loaded_params \ No newline at end of file + return loaded_params + + +class Glm4MoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Glm4MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + self.expert_weights = [] + + # Set MoE hyperparameters + self.num_moe_layers = (config.num_hidden_layers - + config.first_k_dense_replace) + self.num_expert_groups = config.n_group + + self.moe_layers: list[FusedMoE] = [] + for layer in self.model.layers: + assert isinstance(layer, Glm4MoeDecoderLayer) + if isinstance(layer.mlp, Glm4MoE): + self.moe_layers.append(layer.mlp.experts) + + # Pick last one layer since the first ones may be dense layers. + example_moe = typing.cast( + Glm4MoE, self.model.layers[config.num_hidden_layers - 1].mlp) + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py index db6050fac57c..c2728f0ed64c 100644 --- a/vllm/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -73,7 +73,7 @@ def __init__( IMAGE_ATOM_ID = -300 IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] -# AutoConfig.register("aimv2", AIMv2Config) +AutoConfig.register("aimv2", AIMv2Config) # ---------------------------------------------------------------------- From ffe9d62e42a7b222eee5f7e3c2af3fc5e2ebabe7 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 12 Jul 2025 22:28:43 +0800 Subject: [PATCH 15/44] fix partial_rotary_factor Signed-off-by: Isotr0py --- vllm/model_executor/models/glm4_moe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 0bb465267c28..c8e7988f6dea 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -198,6 +198,7 @@ class Glm4MoeAttention(nn.Module): def __init__( self, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -250,12 +251,14 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj") + partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, + partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( self.num_heads, @@ -312,6 +315,7 @@ def __init__( self.layer_idx = layer_idx self.self_attn = Glm4MoeAttention( + config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, @@ -320,6 +324,7 @@ def __init__( max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, + qkv_bias=config.attention_bias, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", From dbf1719cfd05992a81663757515280230a1b57a0 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Mon, 14 Jul 2025 00:02:45 +0800 Subject: [PATCH 16/44] Update for doc --- benchmarks/kernels/benchmark_moe_permute_unpermute.py | 6 ++++-- docs/models/supported_models.md | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git 
a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index dba1f3943b96..e503307f37ce 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -321,10 +321,12 @@ def main(args: argparse.Namespace): ): E = config.n_routed_experts topk = config.num_experts_per_tok - elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]: + elif ( + config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"] + or config.architectures[0] == "Glm4MoeForCausalLM" + ): E = config.num_experts topk = config.num_experts_per_tok - else: # Support for llama4 config = config.get_text_config() diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index eca37a09058a..f36742501fc2 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -573,6 +573,7 @@ Specified using `--task generate`. | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinkg`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4-MoE | T + IE+ + VE+ | `THUDM/GLM-4-MoE-100B-A10B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | From 0ea4b998abba2eca6db111b00341d07b11e18caa Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Tue, 15 Jul 2025 15:14:56 +0800 Subject: [PATCH 17/44] update for GLM MPT draft --- tests/models/registry.py | 2 + vllm/config.py | 9 +- vllm/engine/arg_utils.py | 3 +- vllm/model_executor/models/glm4_moe.py | 1 + vllm/model_executor/models/glm4_moe_mtp.py | 285 +++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 6 files changed, 297 insertions(+), 4 deletions(-) create mode 100644 vllm/model_executor/models/glm4_moe_mtp.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 449719991cd9..c53d9443a5a9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -470,6 +470,8 @@ def check_available_online( is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), + "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4-MoE", + speculative_model="THUDM/GLM-4-MoE"), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True, speculative_model="XiaomiMiMo/MiMo-7B-RL") diff --git a/vllm/config.py b/vllm/config.py index 42410006f60d..0264b1931d16 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2545,7 +2545,8 @@ def __post_init__(self): SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", - "mlp_speculator", "draft_model", "deepseek_mtp"] + "mlp_speculator", "draft_model", "deepseek_mtp", + "glm4_moe_mtp"] SpeculativeAcceptanceMethod = Literal["rejection_sampler", "typical_acceptance_sampler"] @@ -2805,8 +2806,10 @@ def __post_init__(self): elif (self.draft_model_config.hf_config.model_type == "mlp_speculator"): self.method = "mlp_speculator" - elif (self.draft_model_config.hf_config.model_type == - "deepseek_mtp"): + elif (self.draft_model_config.hf_config.model_type + == "deepseek_mtp" + or self.draft_model_config.hf_config.model_type + == "glm4_moe_mtp"): self.method = "deepseek_mtp" if self.num_speculative_tokens > 1: logger.warning( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 269477c48481..84ebabcfc283 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1507,7 +1507,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: is_ngram_enabled = True elif speculative_method == "medusa": is_medusa_enabled = True - elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"): + elif speculative_method in ("eagle", "eagle3", "deepseek_mtp", + "glm4_moe_mtp"): is_eagle_enabled = True else: speculative_model = self.speculative_config.get("model") diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index c8e7988f6dea..6cdb02bb1b85 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -328,6 +328,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + add_qk_norm=config.add_qk_norm, ) if (config.n_routed_experts is not None diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py new file mode 100644 index 000000000000..70d90c922bd6 --- /dev/null +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import 
CacheConfig, VllmConfig +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .deepseek_v2 import get_spec_layer_idx_from_weight_name +from .glm4_moe import Glm4MoeDecoderLayer +from .interfaces import SupportsPP +from .utils import maybe_prefix + + +class SharedHead(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.norm(hidden_states) + + +class Glm4MoeMultiTokenPredictorLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.eh_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.shared_head = SharedHead(config=config, quant_config=quant_config) + self.mtp_block = Glm4MoeDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions == 0] = 0 + inputs_embeds = self.enorm(inputs_embeds) + previous_hidden_states = self.hnorm(previous_hidden_states) + + hidden_states = self.eh_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + residual=None) + hidden_states = residual + hidden_states + return hidden_states + + +class Glm4MoeMultiTokenPredictor(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + Glm4MoeMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + cache_config=vllm_config.cache_config, + quant_config=vllm_config.quant_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + 
inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + current_step_idx = (spec_step_idx % self.num_mtp_layers) + return self.layers[str(self.mtp_start_layer_idx + current_step_idx)]( + input_ids, + positions, + previous_hidden_states, + inputs_embeds, + current_step_idx, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + current_step_idx = (spec_step_idx % self.num_mtp_layers) + mtp_layer = self.layers[str(self.mtp_start_layer_idx + + current_step_idx)] + logits = self.logits_processor(mtp_layer.shared_head.head, + mtp_layer.shared_head(hidden_states), + sampling_metadata) + return logits + + +class Glm4MoeMTP(nn.Module, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + self.model = Glm4MoeMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, + previous_hidden_states, inputs_embeds, + spec_step_idx) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> Optional[torch.Tensor]: + return self.model.compute_logits(hidden_states, sampling_metadata, + spec_step_idx) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is None: + continue + name = self._rewrite_spec_layer_name(spec_layer, name) + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
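# Illustrative sketch, not part of the diff: the expected effect of
# _rewrite_spec_layer_name defined further below, assuming a hypothetical main
# model with 46 hidden layers so the single MTP layer sits at index 46.
def rewrite(spec_layer: int, name: str) -> str:
    shared = ("embed_tokens",)
    spec_only = ("enorm", "hnorm", "eh_proj", "shared_head") + shared
    if not any(w in name for w in spec_only):
        # plain transformer weights move under the mtp_block submodule
        return name.replace(f"model.layers.{spec_layer}.",
                            f"model.layers.{spec_layer}.mtp_block.")
    if any(w in name for w in shared):
        # shared weights (the embedding) are hoisted to the top level
        return name.replace(f"model.layers.{spec_layer}.", "model.")
    return name

assert (rewrite(46, "model.layers.46.input_layernorm.weight")
        == "model.layers.46.mtp_block.input_layernorm.weight")
assert rewrite(46, "model.layers.46.embed_tokens.weight") == "model.embed_tokens.weight"
assert (rewrite(46, "model.layers.46.shared_head.head.weight")
        == "model.layers.46.shared_head.head.weight")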
+ if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # According to DeepSeek-V3 Technical Report, MTP modules + # shares embedding layer. We only load the first weights. + if (spec_layer != self.model.mtp_start_layer_idx + and ".layers" not in name): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str: + """ + Rewrite the weight name to match the format of the original model. + Add .mtp_block for modules in transformer layer block for spec layer + and rename shared layer weights to be top level. + """ + spec_layer_weight_names = [ + "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head" + ] + shared_weight_names = ["embed_tokens"] + spec_layer_weight = False + shared_weight = False + for weight_name in spec_layer_weight_names: + if weight_name in name: + spec_layer_weight = True + if weight_name in shared_weight_names: + shared_weight = True + break + if not spec_layer_weight: + # treat rest weights as weights for transformer layer block + name = name.replace(f"model.layers.{spec_layer}.", + f"model.layers.{spec_layer}.mtp_block.") + elif shared_weight: + # treat shared weights as top level weights + name = name.replace(f"model.layers.{spec_layer}.", "model.") + return name diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index af13450bd8d9..6db157b491d6 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -247,6 +247,7 @@ "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), + "Glm4MoeMTPForCausalLM": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } From 1c1551311b71c065e2f1f9897342954979c1bc14 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Tue, 15 Jul 2025 16:01:37 +0800 Subject: [PATCH 18/44] MTP and main model are diff, add error msgs --- vllm/model_executor/models/glm4_moe.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 6cdb02bb1b85..564adfa3f18a 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -22,6 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GLM-4-MOE model compatible with HuggingFace weights.""" +import json import typing from collections.abc import Callable, Iterable from typing import Any, Optional, Union @@ -586,6 +587,14 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + if hasattr(config, "num_nextn_predict_layers"): + hf_overrides = {"architectures": ["Glm4MoeMTPForCausalLM"]} + raise RuntimeError( + "The configuration of this model indicates that it supports " + "MTP, but you instantiated the main model without MTP" + "of this model. Please use the vision model by setting " + f"`--hf-overrides '{json.dumps(hf_overrides)}'`") + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config @@ -667,4 +676,4 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) From 744e071270005a572c6434a491f190dba6f12c14 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Tue, 15 Jul 2025 17:08:42 +0800 Subject: [PATCH 19/44] Update benchmark_moe.py --- benchmarks/kernels/benchmark_moe.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 9c5a5573b73e..5c83b980d4e3 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -566,14 +566,26 @@ def main(args: argparse.Namespace): if args.model_prefix: config = getattr(config, args.model_prefix) - if config.architectures[0] in ( - "DbrxForCausalLM", - "JambaForCausalLM", + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + intermediate_size = config.ffn_config.ffn_hidden_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + intermediate_size = config.intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ( "DeepseekV3ForCausalLM", - "DeepseekV2ForCausalLMQwen2MoeForCausalLM", - "Qwen3MoeForCausalLM", + "DeepseekV2ForCausalLM", "Glm4MoeForCausalLM", ): + E = config.n_routed_experts + topk = config.num_experts_per_tok + intermediate_size = config.moe_intermediate_size + shard_intermediate_size = 2 * intermediate_size // args.tp_size + elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size From 5ab9ee66f15c44abffcd83d4f5707c94876ec259 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Tue, 15 Jul 2025 21:11:11 +0800 Subject: [PATCH 20/44] Update benchmark_moe_permute_unpermute.py --- benchmarks/kernels/benchmark_moe_permute_unpermute.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index e503307f37ce..4ed690090144 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -318,15 +318,14 @@ def main(args: argparse.Namespace): elif ( 
config.architectures[0] == "DeepseekV3ForCausalLM" or config.architectures[0] == "DeepseekV2ForCausalLM" + or config.architectures[0] == "Glm4MoeForCausalLM" ): E = config.n_routed_experts topk = config.num_experts_per_tok - elif ( - config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"] - or config.architectures[0] == "Glm4MoeForCausalLM" - ): + elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]: E = config.num_experts topk = config.num_experts_per_tok + else: # Support for llama4 config = config.get_text_config() From 2805cc344f2cd800a752c66a1b9ae3c38adebd3d Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Wed, 16 Jul 2025 14:49:55 +0800 Subject: [PATCH 21/44] use transformers name --- vllm/model_executor/models/glm4_moe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 564adfa3f18a..a947e08e2f21 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -209,7 +209,7 @@ def __init__( head_dim: Optional[int] = None, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, - add_qk_norm: bool = False, + use_qk_norm: bool = False, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -236,7 +236,7 @@ def __init__( self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - self.add_qk_norm = add_qk_norm + self.use_qk_norm = use_qk_norm self.qkv_proj = QKVParallelLinear(hidden_size, self.head_dim, @@ -271,7 +271,7 @@ def __init__( prefix=f"{prefix}.attn", ) - if self.add_qk_norm: + if self.use_qk_norm: self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -282,7 +282,7 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if self.add_qk_norm: + if self.use_qk_norm: q = self.q_norm(q.reshape(-1, self.num_heads, self.head_dim)).reshape(q.shape) k = self.k_norm(k.reshape(-1, self.num_kv_heads, @@ -329,7 +329,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", - add_qk_norm=config.add_qk_norm, + use_qk_norm=config.use_qk_norm, ) if (config.n_routed_experts is not None From 25edf51bd070eeb5475522f343614b045390fa96 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 00:42:23 +0800 Subject: [PATCH 22/44] Update config.py --- vllm/config.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 6bf98617b5ad..3d6ea7c221b3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2657,7 +2657,12 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "n_predict": n_predict, "architectures": ["DeepSeekMTPModel"] }) - + if hf_config.model_type == "glm4_moe_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["Glm4MoeMTPModel"] + }) if hf_config.architectures[0] == "MiMoForCausalLM": hf_config.model_type = "mimo_mtp" n_predict = getattr(hf_config, "num_nextn_predict_layers", None) @@ -2684,10 +2689,8 @@ def __post_init__(self): # TODO(Shangming): Refactor mtp configuration logic when supporting # mtp acceleration for more models besides deepseek_v3 if 
self.target_model_config and \ - (self.target_model_config.hf_text_config.model_type \ - == "deepseek_v3" or - self.target_model_config.hf_text_config.model_type \ - == "mimo"): + (self.target_model_config.hf_text_config.model_type in + ["deepseek_v3","mimo","glm4_moe"]): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): From b94241e26b66179af5dcd279de74f849c48752bf Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 01:23:21 +0800 Subject: [PATCH 23/44] 1 --- vllm/model_executor/models/glm4_moe.py | 8 -------- vllm/model_executor/models/glm4_moe_mtp.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index a947e08e2f21..5ed856284e9d 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -587,14 +587,6 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - if hasattr(config, "num_nextn_predict_layers"): - hf_overrides = {"architectures": ["Glm4MoeMTPForCausalLM"]} - raise RuntimeError( - "The configuration of this model indicates that it supports " - "MTP, but you instantiated the main model without MTP" - "of this model. Please use the vision model by setting " - f"`--hf-overrides '{json.dumps(hf_overrides)}'`") - quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 70d90c922bd6..dde060c35616 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -60,7 +60,7 @@ def __init__( self.mtp_block = Glm4MoeDecoderLayer(config=config, cache_config=cache_config, quant_config=quant_config, - prefix=prefix), + prefix=prefix) def forward( self, From efeb607b1f0f50536f1d069342a1e52289dc9f6c Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 01:28:06 +0800 Subject: [PATCH 24/44] update --- vllm/model_executor/models/glm4_moe.py | 1 - vllm/worker/worker.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 5ed856284e9d..a7db4fc77500 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -22,7 +22,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GLM-4-MOE model compatible with HuggingFace weights.""" -import json import typing from collections.abc import Callable, Iterable from typing import Any, Optional, Union diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b2926dbd185a..6b6943d76436 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,8 @@ def __init__( "mlp_speculator", "eagle", "deepseek_mtp", - "mimo_mtp")) \ + "glm4_moe_mtp", + "mimo_mtp")) \ else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner From 64bbe48243f1351e85ec8a4bda7c8c705c9f7c7b Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 01:52:53 +0800 Subject: [PATCH 25/44] Update config.py --- vllm/config.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 3d6ea7c221b3..2deed5589496 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2657,11 +2657,12 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "n_predict": n_predict, "architectures": ["DeepSeekMTPModel"] }) - if hf_config.model_type == "glm4_moe_mtp": + if hf_config.architectures[0] == "Glm4MoeForCausalLM": + hf_config.model_type = "glm4_moe_mtp" n_predict = getattr(hf_config, "num_nextn_predict_layers", None) hf_config.update({ "n_predict": n_predict, - "architectures": ["Glm4MoeMTPModel"] + "architectures": ["Glm4MoeMTPForCausalLM"] }) if hf_config.architectures[0] == "MiMoForCausalLM": hf_config.model_type = "mimo_mtp" @@ -2671,8 +2672,6 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "n_predict": n_predict, "architectures": ["MiMoMTPModel"] }) - return hf_config - return hf_config def __post_init__(self): @@ -2690,7 +2689,7 @@ def __post_init__(self): # mtp acceleration for more models besides deepseek_v3 if self.target_model_config and \ (self.target_model_config.hf_text_config.model_type in - ["deepseek_v3","mimo","glm4_moe"]): + ('deepseek_v3', 'mimo', 'glm4_moe')): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): From 078e2ef9a117a12595aba0736b74c759c9a3fbc5 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 12:35:16 +0800 Subject: [PATCH 26/44] Update glm4_moe.py --- vllm/model_executor/models/glm4_moe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index a7db4fc77500..97be201d51be 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -53,6 +53,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .deepseek_v2 import get_spec_layer_idx_from_weight_name from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -484,6 +485,9 @@ def load_weights(self, weights: Iterable[tuple[str, params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). 
if weight_name not in name: From 7d4340fe0eaa99992f3dfe061515cb70fc463d5a Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 12:55:09 +0800 Subject: [PATCH 27/44] all use Glm4MoeMTPModel --- vllm/config.py | 2 +- vllm/model_executor/models/registry.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 2deed5589496..9a0f6eb2c4c0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2662,7 +2662,7 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: n_predict = getattr(hf_config, "num_nextn_predict_layers", None) hf_config.update({ "n_predict": n_predict, - "architectures": ["Glm4MoeMTPForCausalLM"] + "architectures": ["Glm4MoeMTPModel"] }) if hf_config.architectures[0] == "MiMoForCausalLM": hf_config.model_type = "mimo_mtp" diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 1332609ff824..845d1653f11b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -249,7 +249,7 @@ "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), - "Glm4MoeMTPForCausalLM": ("glm4_moe_mtp", "Glm4MoeMTP"), + "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } From a80072e15ca7b404258d846d41e20e976ba7c388 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 13:16:07 +0800 Subject: [PATCH 28/44] remove model prefix --- vllm/config.py | 11 ++++----- vllm/model_executor/models/glm4_moe.py | 13 ++++++++++- vllm/model_executor/models/glm4_moe_mtp.py | 26 ++++++++++++++++++++-- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9a0f6eb2c4c0..473833cf335f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2658,12 +2658,13 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "architectures": ["DeepSeekMTPModel"] }) if hf_config.architectures[0] == "Glm4MoeForCausalLM": - hf_config.model_type = "glm4_moe_mtp" n_predict = getattr(hf_config, "num_nextn_predict_layers", None) - hf_config.update({ - "n_predict": n_predict, - "architectures": ["Glm4MoeMTPModel"] - }) + if n_predict: # GLM-MoE have both MTP and Not MTP model + hf_config.update({ + "model_type": "glm4_moe_mtp", + "n_predict": n_predict, + "architectures": ["Glm4MoeMTPModel"] + }) if hf_config.architectures[0] == "MiMoForCausalLM": hf_config.model_type = "mimo_mtp" n_predict = getattr(hf_config, "num_nextn_predict_layers", None) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 97be201d51be..cb0f1fd74fb3 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -53,7 +53,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .deepseek_v2 import get_spec_layer_idx_from_weight_name from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -672,3 +671,15 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + +def 
get_spec_layer_idx_from_weight_name(config: PretrainedConfig, + weight_name: str) -> Optional[int]: + if hasattr(config, + "num_nextn_predict_layers") and (config.num_nextn_predict_layers + > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_nextn_predict_layers): + if weight_name.startswith(f"layers.{layer_idx+i}."): + return layer_idx + i + return None diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index dde060c35616..217223c68c0e 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -1,5 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The ZhipuAI Team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only GLM-4-MOE MTP model compatible with HuggingFace weights.""" + from collections.abc import Iterable from typing import Optional @@ -18,8 +41,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .deepseek_v2 import get_spec_layer_idx_from_weight_name -from .glm4_moe import Glm4MoeDecoderLayer +from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name from .interfaces import SupportsPP from .utils import maybe_prefix From 427765812c974a90362bb07922b7c1a31a92a41d Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 15:20:10 +0800 Subject: [PATCH 29/44] update --- vllm/config.py | 4 +++- vllm/model_executor/models/glm4_moe.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 473833cf335f..aa0039883b0e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1327,9 +1327,11 @@ def get_layers_start_end_indices( self, parallel_config: "ParallelConfig") -> tuple[int, int]: from vllm.distributed.utils import get_pp_indices if (self.hf_text_config.model_type == "deepseek_mtp" - or self.hf_config.model_type == "mimo_mtp"): + or self.hf_config.model_type == "mimo_mtp" + or self.hf_config.model_type == "glm4_moe_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + print(total_num_hidden_layers) else: total_num_hidden_layers = getattr(self.hf_text_config, "num_hidden_layers", 0) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index cb0f1fd74fb3..a93cc3223731 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -680,6 
+680,6 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, > 0): layer_idx = config.num_hidden_layers for i in range(config.num_nextn_predict_layers): - if weight_name.startswith(f"layers.{layer_idx+i}."): + if weight_name.startswith(f"model.layers.{layer_idx+i}."): return layer_idx + i return None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 7dda1cbfe230..acfc25372409 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -200,7 +200,9 @@ def create_worker( enable_lm_head_weight_load = True proposer_worker = MultiStepWorker(**draft_worker_kwargs) - if draft_model_config.hf_config.model_type == "deepseek_mtp": + if (draft_model_config.hf_config.model_type == "deepseek_mtp" + or draft_model_config.hf_config.model_type + == "glm4_moe_mtp"): num_spec_prefill_steps = \ draft_model_config.hf_config.n_predict From 4bb6b272d2bb86e8abb2f5ed4087f6e355842f1c Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Thu, 17 Jul 2025 20:30:45 +0800 Subject: [PATCH 30/44] FC support --- tests/tool_use/test_glm4_moe_tool_parser.py | 411 ++++++++++++++++++ .../openai/tool_parsers/__init__.py | 3 +- .../tool_parsers/glm4_moe_tool_parser.py | 399 +++++++++++++++++ vllm/model_executor/models/glm4_moe.py | 2 +- 4 files changed, 813 insertions(+), 2 deletions(-) create mode 100644 tests/tool_use/test_glm4_moe_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py new file mode 100644 index 000000000000..b624c0d1c179 --- /dev/null +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -0,0 +1,411 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 + +import json + +import pytest + +from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Use a common model that is likely to be available +MODEL = "/model/GLM-4.2-MoE-106B-A12B" + + +@pytest.fixture(scope="module") +def glm4_moe_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) + + +@pytest.fixture +def glm4_moe_tool_parser(glm4_moe_tokenizer): + return Glm4MoeModelToolParser(glm4_moe_tokenizer) + + +def assert_tool_calls(actual_tool_calls: list[ToolCall], + expected_tool_calls: list[ToolCall]): + assert len(actual_tool_calls) == len(expected_tool_calls) + + for actual_tool_call, expected_tool_call in zip(actual_tool_calls, + expected_tool_calls): + assert isinstance(actual_tool_call.id, str) + assert len(actual_tool_call.id) > 0 + + assert actual_tool_call.type == "function" + assert actual_tool_call.function.name == expected_tool_call.function.name + # Compare arguments as JSON objects to handle formatting differences + actual_args = json.loads(actual_tool_call.function.arguments) + expected_args = json.loads(expected_tool_call.function.arguments) + assert actual_args == expected_args + + +def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): + model_output = "This is a test" + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output + + 
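For reference, the fixtures below all follow the same GLM-4.5-style tool-call layout that this parser targets: a <tool_call> block whose first line is the function name, followed by alternating <arg_key>/<arg_value> pairs. A minimal sketch of one such completion (function and argument names are illustrative only):

    example_output = (
        "<tool_call>get_current_weather\n"
        "<arg_key>city</arg_key>\n"
        "<arg_value>Dallas</arg_value>\n"
        "</tool_call>"
    )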
+@pytest.mark.parametrize(
+    ids=[
+        "single_tool_call",
+        "multiple_tool_calls",
+        "tool_call_with_content_before",
+        "tool_call_with_mixed_args",
+        "tool_call_with_chinese_content",
+    ],
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=[
+        (
+            """<tool_call>get_current_weather
+<arg_key>city</arg_key>
+<arg_value>Dallas</arg_value>
+<arg_key>state</arg_key>
+<arg_value>TX</arg_value>
+<arg_key>unit</arg_key>
+<arg_value>fahrenheit</arg_value>
+</tool_call>""",
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                ))
+            ],
+            None,
+        ),
+        (
+            """<tool_call>get_current_weather
+<arg_key>city</arg_key>
+<arg_value>Dallas</arg_value>
+<arg_key>state</arg_key>
+<arg_value>TX</arg_value>
+<arg_key>unit</arg_key>
+<arg_value>fahrenheit</arg_value>
+</tool_call>
+<tool_call>get_current_weather
+<arg_key>city</arg_key>
+<arg_value>Orlando</arg_value>
+<arg_key>state</arg_key>
+<arg_value>FL</arg_value>
+<arg_key>unit</arg_key>
+<arg_value>fahrenheit</arg_value>
+</tool_call>""",
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Dallas",
+                        "state": "TX",
+                        "unit": "fahrenheit",
+                    }),
+                )),
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Orlando",
+                        "state": "FL",
+                        "unit": "fahrenheit",
+                    }),
+                )),
+            ],
+            None,
+        ),
+        (
+            """I'll help you check the weather. <tool_call>get_current_weather
+<arg_key>city</arg_key>
+<arg_value>Seattle</arg_value>
+<arg_key>state</arg_key>
+<arg_value>WA</arg_value>
+<arg_key>unit</arg_key>
+<arg_value>celsius</arg_value>
+</tool_call>""",
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "Seattle",
+                        "state": "WA",
+                        "unit": "celsius",
+                    }),
+                ))
+            ],
+            "I'll help you check the weather.",
+        ),
+        (
+            """<tool_call>get_current_weather
+<arg_key>city</arg_key>
+<arg_value>New York</arg_value>
+<arg_key>state</arg_key>
+<arg_value>NY</arg_value>
+<arg_key>unit</arg_key>
+<arg_value>celsius</arg_value>
+</tool_call>""",
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps({
+                        "city": "New York",
+                        "state": "NY",
+                        "unit": "celsius",
+                    }),
+                ))
+            ],
+            None,
+        ),
+        (
+            """I will help you get the weather.<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2025-08-01</arg_value>
+</tool_call>""",
+            [
+                ToolCall(function=FunctionCall(
+                    name="get_weather",
+                    arguments=json.dumps({
+                        "city": "Beijing",
+                        "date": "2025-08-01",
+                    }),
+                ))
+            ],
+            "I will help you get the weather."
+        ),
+    ],
+)
+def test_extract_tool_calls(glm4_moe_tool_parser, model_output,
+                            expected_tool_calls, expected_content):
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+    assert extracted_tool_calls.tools_called
+    assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
+
+    assert extracted_tool_calls.content == expected_content
+
+
+def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser):
+    """Test tool extraction when thinking tags are present."""
+    model_output = """<think>I want to get the weather.</think>
+
+I will help you get the weather.
+<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2025-08-01</arg_value>
+</tool_call>"""
+
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+
+    assert extracted_tool_calls.tools_called
+    assert len(extracted_tool_calls.tool_calls) == 1
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_weather"
+
+    expected_content = """<think>I want to get the weather.</think>
+
+I will help you get the weather."""
+    assert extracted_tool_calls.content == expected_content
+
+
+def test_extract_tool_calls_malformed_xml(glm4_moe_tool_parser):
+    """Test that malformed XML is handled gracefully."""
+    model_output = """<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Seattle</arg_value>
+<arg_key>incomplete_arg
+<arg_value>value
+</tool_call>"""
+
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+
+    # Should handle malformed XML gracefully
+    # The parser should either extract what it can or return no tool calls
+    # depending on how robust we want the parsing to be
+    assert isinstance(extracted_tool_calls.tools_called, bool)
+    assert isinstance(extracted_tool_calls.tool_calls, list)
+
+
+def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser):
+    """Test tool calls with no arguments."""
+    model_output = """<tool_call>get_current_time
+</tool_call>"""
+
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+
+    assert extracted_tool_calls.tools_called
+    assert len(extracted_tool_calls.tool_calls) == 1
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_time"
+    # Empty arguments should result in empty JSON object
+    assert extracted_tool_calls.tool_calls[0].function.arguments == "{}"
+
+
+def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser):
+    """Test extraction with mixed content and multiple tool calls."""
+    model_output = """I will help you get the weather info.
+
+<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2025-08-01</arg_value>
+</tool_call>
+
+Meanwhile, I will also check the weather in Shanghai.
+
+<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Shanghai</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2025-08-01</arg_value>
+</tool_call>"""
+
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+
+    assert extracted_tool_calls.tools_called
+    assert len(extracted_tool_calls.tool_calls) == 2
+
+    # Check first tool call
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_weather"
+    args1 = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
+    assert args1["city"] == "Beijing"
+    assert args1["date"] == "2025-08-01"
+
+    # Check second tool call
+    assert extracted_tool_calls.tool_calls[1].function.name == "get_weather"
+    args2 = json.loads(extracted_tool_calls.tool_calls[1].function.arguments)
+    assert args2["city"] == "Shanghai"
+    assert args2["date"] == "2025-08-01"
+
+    # Content should be everything before the first tool call
+    assert extracted_tool_calls.content == "I will help you get the weather info."
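The argument assertions in these tests assume that each <arg_key>/<arg_value> pair is flattened into a single JSON object per tool call. A self-contained sketch of that mapping, using an illustrative helper name rather than the parser's own implementation:

    import json
    import re

    def xml_args_to_json(args_text: str) -> str:
        # Collect every <arg_key>/<arg_value> pair and emit one JSON object.
        pairs = re.findall(
            r"<arg_key>([^<]+)</arg_key>\s*<arg_value>([^<]*)</arg_value>",
            args_text, re.DOTALL)
        return json.dumps({k.strip(): v.strip() for k, v in pairs},
                          ensure_ascii=False)

    assert xml_args_to_json(
        "<arg_key>city</arg_key><arg_value>Beijing</arg_value>"
    ) == '{"city": "Beijing"}'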
+
+
+def test_streaming_basic_functionality(glm4_moe_tool_parser):
+    """Test basic streaming functionality."""
+    # Reset streaming state
+    glm4_moe_tool_parser.current_tool_name_sent = False
+    glm4_moe_tool_parser.prev_tool_call_arr = []
+    glm4_moe_tool_parser.current_tool_id = -1
+    glm4_moe_tool_parser.streamed_args_for_tool = []
+
+    # Test with a simple tool call
+    current_text = """<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+</tool_call>"""
+
+    # Mock token IDs for testing
+    tool_call_start_id = glm4_moe_tool_parser.tool_call_start_token_id or 12345
+    tool_call_end_id = glm4_moe_tool_parser.tool_call_end_token_id or 12346
+
+    result = glm4_moe_tool_parser.extract_tool_calls_streaming(
+        previous_text="",
+        current_text=current_text,
+        delta_text="",
+        previous_token_ids=[],
+        current_token_ids=[tool_call_start_id, tool_call_end_id],
+        delta_token_ids=[tool_call_end_id],
+        request=None,
+    )
+
+    # The result behavior depends on the streaming state
+    # This test mainly ensures no exceptions are thrown
+    assert result is None or hasattr(result, 'tool_calls') or hasattr(result, 'content')
+
+
+def test_streaming_no_tool_calls(glm4_moe_tool_parser):
+    """Test streaming when there are no tool calls."""
+    current_text = "This is just regular text without any tool calls."
+
+    result = glm4_moe_tool_parser.extract_tool_calls_streaming(
+        previous_text="This is just regular text",
+        current_text=current_text,
+        delta_text=" without any tool calls.",
+        previous_token_ids=[],
+        current_token_ids=[],
+        delta_token_ids=[],
+        request=None,
+    )
+
+    # Should return the delta text as content
+    assert result is not None
+    assert hasattr(result, 'content')
+    assert result.content == " without any tool calls."
+
+
+def test_streaming_with_content_before_tool_calls(glm4_moe_tool_parser):
+    """Test streaming when there's content before tool calls."""
+    # Reset streaming state
+    glm4_moe_tool_parser.current_tool_name_sent = False
+    glm4_moe_tool_parser.prev_tool_call_arr = []
+    glm4_moe_tool_parser.current_tool_id = -1
+    glm4_moe_tool_parser.streamed_args_for_tool = []
+
+    current_text = "I will help you get the weather"
+
+    result = glm4_moe_tool_parser.extract_tool_calls_streaming(
+        previous_text="I will help you",
+        current_text=current_text,
+        delta_text="get the weather.",
+        previous_token_ids=[],
+        current_token_ids=[],
+        delta_token_ids=[],
+        request=None,
+    )
+
+    # Should return content when no tool call tokens are detected
+    assert result is not None
+    assert hasattr(result, 'content')
+    assert result.content == "get the weather."
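The streaming tests above drive extract_tool_calls_streaming with a single snapshot of text. In serving, the method would typically be called once per decode step with previous_text plus delta_text equal to current_text; a hedged sketch of that calling pattern (token-id arguments left empty purely for illustration):

    def drive_streaming(parser, chunks):
        # Feed the parser one delta at a time, as a serving loop would.
        previous = ""
        for delta in chunks:
            current = previous + delta
            parser.extract_tool_calls_streaming(
                previous_text=previous,
                current_text=current,
                delta_text=delta,
                previous_token_ids=[],
                current_token_ids=[],
                delta_token_ids=[],
                request=None,
            )
            previous = current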
+
+
+def test_extract_tool_calls_special_characters(glm4_moe_tool_parser):
+    """Test tool calls with special characters and unicode."""
+    model_output = """<tool_call>send_message
+<arg_key>recipient</arg_key>
+<arg_value>Amy</arg_value>
+<arg_key>message</arg_key>
+<arg_value>It is a nice day</arg_value>
+<arg_key>priority</arg_key>
+<arg_value>high</arg_value>
+</tool_call>"""
+
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+
+    assert extracted_tool_calls.tools_called
+    assert len(extracted_tool_calls.tool_calls) == 1
+    assert extracted_tool_calls.tool_calls[0].function.name == "send_message"
+
+    args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
+    assert args["recipient"] == "Amy"
+    assert args["message"] == "It is a nice day"
+    assert args["priority"] == "high"
+
+
+def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser):
+    """Test incomplete tool calls (missing closing tag)."""
+    model_output = """<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2025-08-01</arg_value>"""
+
+    extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls(
+        model_output, request=None)  # type: ignore[arg-type]
+
+    # Incomplete tool calls should not be extracted
+    assert not extracted_tool_calls.tools_called
+    assert extracted_tool_calls.tool_calls == []
+    assert extracted_tool_calls.content == model_output
\ No newline at end of file
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 218a120a5bb0..aee28fef9ef1 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -3,6 +3,7 @@
 
 from .abstract_tool_parser import ToolParser, ToolParserManager
 from .deepseekv3_tool_parser import DeepSeekV3ToolParser
+from .glm4_moe_tool_parser import Glm4MoeModelToolParser
 from .granite_20b_fc_tool_parser import Granite20bFCToolParser
 from .granite_tool_parser import GraniteToolParser
 from .hermes_tool_parser import Hermes2ProToolParser
@@ -23,5 +24,5 @@
     "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser",
     "Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser",
     "DeepSeekV3ToolParser", "xLAMToolParser", "MinimaxToolParser",
-    "KimiK2ToolParser"
+    "KimiK2ToolParser", "Glm4MoeModelToolParser"
 ]
diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
new file mode 100644
index 000000000000..3bca42f42646
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -0,0 +1,399 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# code modified from deepseekv3_tool_parser.py
+
+from collections.abc import Sequence
+from typing import Union
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("glm4_moe")
+class Glm4MoeModelToolParser(ToolParser):
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        self.current_tool_name_sent = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id = -1
+        self.streamed_args_for_tool: list[str] = []
+        self.tool_call_start_token = "<tool_call>"
+        self.tool_call_end_token = "</tool_call>"
+
+        # This is the key fix - we need to set the tool_calls_start_token
+        # used by the base class
+        self.tool_calls_start_token = self.tool_call_start_token
+
+        # Updated regex for the XML-based format
+        self.tool_call_regex = re.compile(
+            r"<tool_call>\s*"
+            r"(?P<function_name>[^\n<]+)\s*"  # function name (up to a newline or '<')
+            r"(?P<function_arguments>(?:<arg_key>[^<]+</arg_key>\s*"
+            r"<arg_value>[^<]*</arg_value>\s*)*)\s*"
+            r"</tool_call>",
+            re.DOTALL,
+        )
+
+        # Regex for parsing individual arguments
+        self.arg_regex = re.compile(
+            r"<arg_key>(?P<key>[^<]+)</arg_key>\s*"
+            r"<arg_value>(?P<value>[^<]*)</arg_value>",
+            re.DOTALL,
+        )
+
+        # Streaming regex
+        self.stream_tool_call_portion_regex = re.compile(
+            r"(?P<function_name>[^\n<]+)\s*"
+            r"(?P<function_arguments>(?:<arg_key>[^<]+</arg_key>\s*"
+            r"<arg_value>[^<]*</arg_value>\s*)*)",
+            re.DOTALL,
+        )
+
+        # For streaming, we also need a regex to match just the function name
+        self.stream_tool_call_name_regex = re.compile(
+            r"(?P<function_name>[^\n<]+)",
+            re.DOTALL,
+        )
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+
+    def _parse_arguments(self, args_text: str) -> str:
+        """Parse XML-based arguments into JSON format."""
+        if not args_text or not args_text.strip():
+            return "{}"
+
+        args_dict = {}
+        matches = self.arg_regex.findall(args_text)
+
+        for key, value in matches:
+            args_dict[key.strip()] = value.strip()
+
+        import json
+        return json.dumps(args_dict, ensure_ascii=False)
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            # Find all tool calls in the output
+            function_call_matches = self.tool_call_regex.findall(model_output)
+
+            logger.debug("function_call_matches: %s", function_call_matches)
+
+            if not function_call_matches:
+                return ExtractedToolCallInformation(
+                    tools_called=False,
+                    tool_calls=[],
+                    content=model_output,
+                )
+
+            tool_calls = []
+            for i, match in enumerate(function_call_matches):
+                function_name, function_args_xml = match
+                function_name = function_name.strip()
+
+                # Parse XML arguments to JSON
+                function_args_json = self._parse_arguments(function_args_xml)
+
+                tool_calls.append(
+                    ToolCall(
+                        id=f"call_{i}",
+                        type='function',
+                        function=FunctionCall(name=function_name,
+                                              arguments=function_args_json),
+                    ))
+
+            # Extract content before the first tool call
+            content = model_output[:model_output.find(self.tool_calls_start_token)]
+            return ExtractedToolCallInformation(
+                tools_called=bool(tool_calls),
+                tool_calls=tool_calls,
+                content=content.strip() if content.strip() else None,
+            )
+
+        except Exception:
+            logger.exception(
+                "Error in extracting tool call from response.")
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        logger.debug("delta_text: %s", delta_text)
+        logger.debug("delta_token_ids: %s", delta_token_ids)
+
+        # check to see if we should be streaming a tool call
+        if self.tool_call_start_token_id not in current_token_ids:
+            logger.debug("No tool call tokens found!")
+            return
DeltaMessage(content=delta_text) + + # Remove tool call tokens from delta text for processing + delta_text = delta_text.replace(self.tool_call_start_token, + "").replace(self.tool_call_end_token, + "") + try: + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_token_ids.count( + self.tool_call_start_token_id) + prev_tool_end_count = previous_token_ids.count( + self.tool_call_end_token_id) + cur_tool_start_count = current_token_ids.count( + self.tool_call_start_token_id) + cur_tool_end_count = current_token_ids.count( + self.tool_call_end_token_id) + + tool_call_portion = None + text_portion = None + + # case: if we're generating text, OR rounding out a tool call + if (cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count + and self.tool_call_end_token not in delta_text): + logger.debug("Generating text content! skipping tool parsing.") + return DeltaMessage(content=delta_text) + + # Handle tool call end + if self.tool_call_end_token in delta_text: + logger.debug("tool_call_end_token in delta_text") + full_text = current_text + delta_text + tool_call_portion = full_text.split( + self.tool_call_start_token)[-1].split( + self.tool_call_end_token)[0].strip() + delta_text = delta_text.split( + self.tool_call_end_token)[0].strip() + text_portion = delta_text.split( + self.tool_call_end_token)[-1].strip() + + # case -- we're starting a new tool call + if (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + else: + tool_call_portion = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count): + + # get the portion of the text that's the tool call + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. 
+ elif (cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count >= prev_tool_end_count): + if self.prev_tool_call_arr is None or len( + self.prev_tool_call_arr) == 0: + logger.debug( + "attempting to close tool call, but no tool call") + return None + + # Handle any remaining arguments + if self.current_tool_id < len(self.prev_tool_call_arr): + current_tool_call = self.prev_tool_call_arr[self.current_tool_id] + if current_tool_call.get("arguments"): + remaining_args = current_tool_call["arguments"] + if remaining_args not in self.streamed_args_for_tool[self.current_tool_id]: + diff = remaining_args[len(self.streamed_args_for_tool[self.current_tool_id]):] + if diff: + self.streamed_args_for_tool[self.current_tool_id] += diff + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump(exclude_none=True), + ) + ]) + return None + + # case -- otherwise we're just generating text + else: + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + return delta + + current_tool_call = dict() + if tool_call_portion: + current_tool_call_matches = ( + self.stream_tool_call_portion_regex.match( + tool_call_portion)) + if current_tool_call_matches: + function_name = current_tool_call_matches.group("function_name") + function_args_xml = current_tool_call_matches.group("function_arguments") or "" + + current_tool_call['id'] = f"call_{self.current_tool_id}" + current_tool_call["name"] = function_name.strip() + current_tool_call["arguments"] = self._parse_arguments(function_args_xml) + else: + current_tool_call_name_matches = ( + self.stream_tool_call_name_regex.match( + tool_call_portion)) + if current_tool_call_name_matches: + function_name = current_tool_call_name_matches.group("function_name") + current_tool_call['id'] = f"call_{self.current_tool_id}" + current_tool_call["name"] = function_name.strip() + current_tool_call["arguments"] = "{}" + else: + logger.debug("Not enough tokens to parse tool call") + return None + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. + if not self.current_tool_name_sent: + if current_tool_call is None: + return None + function_name: Union[str, None] = current_tool_call.get("name") + tool_id = current_tool_call.get("id") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True), + ) + ]) + else: + return None + + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = (DeltaMessage( + content=delta_text) if text_portion is not None else None) + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. + + logger.debug("Trying to parse current tool call with ID %s", + self.current_tool_id) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. 
partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but none are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error("should be impossible to have arguments reset " + "mid-call. skipping streaming anything.") + delta = None + + # case -- we now have the first info about arguments available + elif cur_arguments and not prev_arguments: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=cur_arguments).model_dump( + exclude_none=True), + ) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] = cur_arguments + + # last case -- we have an update to existing arguments. + elif cur_arguments and prev_arguments: + if (isinstance(delta_text, str) + and cur_arguments != prev_arguments + and len(cur_arguments) > len(prev_arguments) + and cur_arguments.startswith(prev_arguments)): + delta_arguments = cur_arguments[len(prev_arguments):] + logger.debug("got diff %s", delta_arguments) + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=delta_arguments).model_dump( + exclude_none=True), + ) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] = cur_arguments + else: + delta = None + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[ + self.current_tool_id] = current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + return None # do not stream a delta. skip this token ID. 
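Assuming the registration above, the parser can also be exercised directly on a fully formed completion; at serve time it would typically be selected with --enable-auto-tool-choice and --tool-call-parser glm4_moe (the name registered above). A minimal offline sketch, with the model name taken from elsewhere in this series:

    from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser
    from vllm.transformers_utils.tokenizer import get_tokenizer

    tokenizer = get_tokenizer("THUDM/GLM-4.5")
    parser = Glm4MoeModelToolParser(tokenizer)
    result = parser.extract_tool_calls(
        "<tool_call>get_weather\n"
        "<arg_key>city</arg_key>\n"
        "<arg_value>Beijing</arg_value>\n"
        "</tool_call>",
        request=None,
    )
    # Expect result.tools_called to be True and the single call to be named "get_weather".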
\ No newline at end of file
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index a93cc3223731..cb0f1fd74fb3 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -680,6 +680,6 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
                                                  > 0):
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"model.layers.{layer_idx+i}."):
+            if weight_name.startswith(f"layers.{layer_idx+i}."):
                 return layer_idx + i
     return None

From f4f4c52522f22bd960d896439520664e022cf7c8 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 17 Jul 2025 21:17:19 +0800
Subject: [PATCH 31/44] glm-4-moe: support disabling thinking

---
 vllm/reasoning/__init__.py                  |   2 +
 vllm/reasoning/glm4_moe_reasoning_parser.py | 151 ++++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 vllm/reasoning/glm4_moe_reasoning_parser.py

diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 3e5485b883f1..59cbb8430e2c 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -6,6 +6,7 @@
 from .granite_reasoning_parser import GraniteReasoningParser
 from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
 from .qwen3_reasoning_parser import Qwen3ReasoningParser
+from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
 
 __all__ = [
     "ReasoningParser",
@@ -14,4 +15,5 @@
     "GraniteReasoningParser",
     "HunyuanA13BReasoningParser",
     "Qwen3ReasoningParser",
+    "Glm4MoeModelReasoningParser",
 ]
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
new file mode 100644
index 000000000000..bc9e8b7abf82
--- /dev/null
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+from typing import Optional, Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("glm4_moe")
+class Glm4MoeModelReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for the Glm4MoeModel model.
+
+    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning
+    text within its output. The model provides a strict switch to disable
+    reasoning output via the 'enable_thinking=False' parameter. This parser
+    extracts the reasoning content enclosed by <think> and </think> tokens
+    from the model's output.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        self.think_start_token = "<think>"
+        self.think_end_token = "</think>"
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction.")
+
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+        if (self.think_start_token_id is None
+                or self.think_end_token_id is None):
+            raise RuntimeError(
+                "Glm4MoeModel reasoning parser could not locate think start/end "
+                "tokens in the tokenizer!")
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        return self.think_end_token_id in input_ids
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract the content after the end tokens
+        """
+        if self.think_end_token_id not in input_ids[:-1]:
+            return []
+        else:
+            return input_ids[input_ids.index(self.think_end_token_id) + 1:]
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
+                self.think_start_token_id, self.think_end_token_id
+        ]):
+            return None
+
+        if self.think_start_token_id in previous_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # <think> in previous, </think> in previous,
+                # response content continues
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token_id in delta_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index +
+                                               len(self.think_start_token
+                                                   ):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # thinking is disabled, just content
+            return DeltaMessage(content=delta_text)
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[Optional[str], Optional[str]]:
+        """
+        Extract reasoning content from the model output.
+
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+
+        Returns:
+            tuple[Optional[str], Optional[str]]: reasoning content and content
+        """
+
+        # Check if the model output contains the <think> and </think> tokens.
+        if (self.think_start_token not in model_output
+                or self.think_end_token not in model_output):
+            return None, model_output
+        # Check if the <think> is present in the model output, remove it
+        # if it is present.
+        model_output_parts = model_output.partition(self.think_start_token)
+        model_output = model_output_parts[2] if model_output_parts[
+            1] else model_output_parts[0]
+        # Check if the model output contains the </think> tokens.
+        # If the end token is not found, return the model output as is.
+        if self.think_end_token not in model_output:
+            return None, model_output
+
+        # Extract reasoning content from the model output.
+        reasoning_content, _, content = model_output.partition(
+            self.think_end_token)
+
+        final_content = content or None
+        return reasoning_content, final_content

From 4a39af8f689500493acf7e5d5a558b63edfde618 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 17 Jul 2025 22:53:30 +0800
Subject: [PATCH 32/44] Update test_glm4_moe_tool_parser.py

---
 tests/tool_use/test_glm4_moe_tool_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py
index b624c0d1c179..cd2ceb63c99c 100644
--- a/tests/tool_use/test_glm4_moe_tool_parser.py
+++ b/tests/tool_use/test_glm4_moe_tool_parser.py
@@ -11,7 +11,7 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 # Use a common model that is likely to be available
-MODEL = "/model/GLM-4.2-MoE-106B-A12B"
+MODEL = "THUDM/GLM-4-MoE"
 

From 762a339ef83e4bea754a41d2e9bf9c55ed1e8e7d Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Thu, 17 Jul 2025 13:00:35 -0700
Subject: [PATCH 33/44] fix get_spec_layer_idx_from_weight_name for glm4

Signed-off-by: Lu Fang
---
 vllm/model_executor/models/glm4_moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index cb0f1fd74fb3..82f73ff97d9d 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -680,6 +680,6 @@ def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
                                                  > 0):
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"layers.{layer_idx+i}."):
+            if f"layers.{layer_idx+i}." in weight_name:
in weight_name: return layer_idx + i return None From 235d24efdb27b163176ea17b8ae714dda09c82a7 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 18 Jul 2025 09:45:28 +0800 Subject: [PATCH 34/44] update --- tests/tool_use/test_glm4_moe_tool_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index cd2ceb63c99c..7eee73ae19a0 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -14,6 +14,7 @@ MODEL = "THUDM/GLM-4-MoE" +@pytest.mark.skip(reason="Temporarily disabled due to Model is not release") @pytest.fixture(scope="module") def glm4_moe_tokenizer(): return get_tokenizer(tokenizer_name=MODEL) From 8d38ab3982d16c9e72730966ba37676996b606d4 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 18 Jul 2025 10:00:58 +0800 Subject: [PATCH 35/44] updte --- tests/tool_use/test_glm4_moe_tool_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index 7eee73ae19a0..3d921ae6fb50 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -10,11 +10,11 @@ from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser from vllm.transformers_utils.tokenizer import get_tokenizer +pytest.skip("skip glm4_moe parser test", allow_module_level=True) # Use a common model that is likely to be available MODEL = "THUDM/GLM-4-MoE" -@pytest.mark.skip(reason="Temporarily disabled due to Model is not release") @pytest.fixture(scope="module") def glm4_moe_tokenizer(): return get_tokenizer(tokenizer_name=MODEL) From 5e9c51344f12646d028d877b12e1789510e4828f Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Fri, 18 Jul 2025 23:41:38 +0800 Subject: [PATCH 36/44] use new name --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 6 +++--- tests/tool_use/test_glm4_moe_tool_parser.py | 2 +- vllm/model_executor/models/glm4_moe.py | 2 +- vllm/model_executor/models/glm4_moe_mtp.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 37c072a7f7e2..85d2df5d5af4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -576,7 +576,7 @@ Specified using `--task generate`. | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4-MoE | T + IE+ + VE+ | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 4496295cb4b7..72d5eb8932a7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -363,7 +363,7 @@ def check_available_online( trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"), # noqa: E501 - "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4-MoE-100B-A10B", min_transformers_version="4.54"), # noqa: E501 + "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5", min_transformers_version="4.54"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 @@ -485,8 +485,8 @@ def check_available_online( is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), - "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4-MoE", - speculative_model="THUDM/GLM-4-MoE"), + "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4.5", + speculative_model="THUDM/GLM-4.5"), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True, speculative_model="XiaomiMiMo/MiMo-7B-RL") diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index 3d921ae6fb50..0e7cb2ec5e35 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -12,7 +12,7 @@ pytest.skip("skip glm4_moe parser test", allow_module_level=True) # Use a common model that is likely to be available -MODEL = "THUDM/GLM-4-MoE" +MODEL = "THUDM/GLM-4.5" @pytest.fixture(scope="module") diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 82f73ff97d9d..bdca293d21db 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only GLM-4-MOE model compatible with HuggingFace weights.""" +"""Inference-only GLM-4.5 model compatible with HuggingFace weights.""" import typing from collections.abc import Callable, Iterable from typing import Any, Optional, Union diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 217223c68c0e..0624640054d1 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Inference-only GLM-4-MOE MTP model compatible with HuggingFace weights.""" +"""Inference-only GLM-4.5 MTP model compatible with HuggingFace weights.""" from collections.abc import Iterable from typing import Optional From 512201406f15753ab4673bd2c816537d8c99c14d Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Sat, 19 Jul 2025 00:01:26 +0800 Subject: [PATCH 37/44] format --- tests/tool_use/test_glm4_moe_tool_parser.py | 144 +++++++++--------- .../openai/tool_parsers/__init__.py | 24 ++- .../tool_parsers/glm4_moe_tool_parser.py | 70 +++++---- vllm/reasoning/__init__.py | 2 +- 4 files changed, 131 insertions(+), 109 deletions(-) diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index 0e7cb2ec5e35..478f4b916672 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -62,7 +62,7 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ ( - """get_current_weather + """get_current_weather city Dallas state @@ -70,20 +70,20 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): unit fahrenheit """, - [ - ToolCall(function=FunctionCall( - name="get_current_weather", - arguments=json.dumps({ - "city": "Dallas", - "state": "TX", - "unit": "fahrenheit", - }), - )) - ], - None, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit", + }), + )) + ], + None, ), ( - """get_current_weather + """get_current_weather city Dallas state @@ -99,28 +99,28 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): unit fahrenheit """, - [ - ToolCall(function=FunctionCall( - name="get_current_weather", - arguments=json.dumps({ - "city": "Dallas", - "state": "TX", - "unit": "fahrenheit", - }), - )), - ToolCall(function=FunctionCall( - name="get_current_weather", - arguments=json.dumps({ - "city": "Orlando", - "state": "FL", - "unit": "fahrenheit", - }), - )), - ], - None, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit", + }), + )), + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit", + }), + )), + ], + None, ), ( - """I'll help you check the weather. get_current_weather + """I'll help you check the weather. 
get_current_weather city Seattle state @@ -128,20 +128,20 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): unit celsius """, - [ - ToolCall(function=FunctionCall( - name="get_current_weather", - arguments=json.dumps({ - "city": "Seattle", - "state": "WA", - "unit": "celsius", - }), - )) - ], - "I'll help you check the weather.", + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Seattle", + "state": "WA", + "unit": "celsius", + }), + )) + ], + "I'll help you check the weather.", ), ( - """get_current_weather + """get_current_weather city New York state @@ -149,36 +149,32 @@ def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): unit celsius """, - [ - ToolCall(function=FunctionCall( - name="get_current_weather", - arguments=json.dumps({ - "city": "New York", - "state": "NY", - "unit": "celsius", - }), - )) - ], - None, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "New York", + "state": "NY", + "unit": "celsius", + }), + )) + ], + None, ), - ( - """I will help you get the weather.get_weather + ("""I will help you get the weather.get_weather city Beijing date 2025-08-01 - """, - [ - ToolCall(function=FunctionCall( - name="get_weather", - arguments=json.dumps({ - "city": "Beijing", - "date": "2025-08-01", - }), - )) - ], - "I will help you get the weather." - ), + """, [ + ToolCall(function=FunctionCall( + name="get_weather", + arguments=json.dumps({ + "city": "Beijing", + "date": "2025-08-01", + }), + )) + ], "I will help you get the weather."), ], ) def test_extract_tool_calls(glm4_moe_tool_parser, model_output, @@ -245,7 +241,8 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser): assert extracted_tool_calls.tools_called assert len(extracted_tool_calls.tool_calls) == 1 - assert extracted_tool_calls.tool_calls[0].function.name == "get_current_time" + assert extracted_tool_calls.tool_calls[ + 0].function.name == "get_current_time" # Empty arguments should result in empty JSON object assert extracted_tool_calls.tool_calls[0].function.arguments == "{}" @@ -322,7 +319,8 @@ def test_streaming_basic_functionality(glm4_moe_tool_parser): # The result behavior depends on the streaming state # This test mainly ensures no exceptions are thrown - assert result is None or hasattr(result, 'tool_calls') or hasattr(result, 'content') + assert result is None or hasattr(result, 'tool_calls') or hasattr( + result, 'content') def test_streaming_no_tool_calls(glm4_moe_tool_parser): @@ -409,4 +407,4 @@ def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser): # Incomplete tool calls should not be extracted assert not extracted_tool_calls.tools_called assert extracted_tool_calls.tool_calls == [] - assert extracted_tool_calls.content == model_output \ No newline at end of file + assert extracted_tool_calls.content == model_output diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index c8a0f41c3a03..9eda7155f01f 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -20,10 +20,22 @@ from .xlam_tool_parser import xLAMToolParser __all__ = [ - "ToolParser", "ToolParserManager", "Granite20bFCToolParser", - "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", - "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", - "Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser", - 
"DeepSeekV3ToolParser", "xLAMToolParser", "MinimaxToolParser", - "KimiK2ToolParser", "HunyuanA13BToolParser", "Glm4MoeModelToolParser", + "ToolParser", + "ToolParserManager", + "Granite20bFCToolParser", + "GraniteToolParser", + "Hermes2ProToolParser", + "MistralToolParser", + "Internlm2ToolParser", + "Llama3JsonToolParser", + "JambaToolParser", + "Llama4PythonicToolParser", + "PythonicToolParser", + "Phi4MiniJsonToolParser", + "DeepSeekV3ToolParser", + "xLAMToolParser", + "MinimaxToolParser", + "KimiK2ToolParser", + "HunyuanA13BToolParser", + "Glm4MoeModelToolParser", ] diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py index 3bca42f42646..504fccfe411c 100644 --- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py @@ -22,6 +22,7 @@ @ToolParserManager.register_module("glm4_moe") class Glm4MoeModelToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) self.current_tool_name_sent = False @@ -88,9 +89,9 @@ def _parse_arguments(self, args_text: str) -> str: return json.dumps(args_dict, ensure_ascii=False) def extract_tool_calls( - self, - model_output: str, - request: ChatCompletionRequest, + self, + model_output: str, + request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: # sanity check; avoid unnecessary processing @@ -129,7 +130,8 @@ def extract_tool_calls( )) # Extract content before the first tool call - content = model_output[:model_output.find(self.tool_calls_start_token)] + content = model_output[:model_output.find(self. + tool_calls_start_token)] return ExtractedToolCallInformation( tools_called=bool(tool_calls), tool_calls=tool_calls, @@ -137,21 +139,20 @@ def extract_tool_calls( ) except Exception: - logger.exception( - "Error in extracting tool call from response.") + logger.exception("Error in extracting tool call from response.") return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - request: ChatCompletionRequest, + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: logger.debug("delta_text: %s", delta_text) @@ -194,7 +195,7 @@ def extract_tool_calls_streaming( full_text = current_text + delta_text tool_call_portion = full_text.split( self.tool_call_start_token)[-1].split( - self.tool_call_end_token)[0].strip() + self.tool_call_end_token)[0].strip() delta_text = delta_text.split( self.tool_call_end_token)[0].strip() text_portion = delta_text.split( @@ -237,18 +238,24 @@ def extract_tool_calls_streaming( # Handle any remaining arguments if self.current_tool_id < len(self.prev_tool_call_arr): - current_tool_call = self.prev_tool_call_arr[self.current_tool_id] + current_tool_call = self.prev_tool_call_arr[ + self.current_tool_id] if current_tool_call.get("arguments"): remaining_args = current_tool_call["arguments"] - if remaining_args not in self.streamed_args_for_tool[self.current_tool_id]: - diff = remaining_args[len(self.streamed_args_for_tool[self.current_tool_id]):] + if remaining_args not in self.streamed_args_for_tool[ + 
self.current_tool_id]: + diff = remaining_args[len( + self.streamed_args_for_tool[self. + current_tool_id]):] if diff: - self.streamed_args_for_tool[self.current_tool_id] += diff + self.streamed_args_for_tool[ + self.current_tool_id] += diff return DeltaMessage(tool_calls=[ DeltaToolCall( index=self.current_tool_id, function=DeltaFunctionCall( - arguments=diff).model_dump(exclude_none=True), + arguments=diff).model_dump( + exclude_none=True), ) ]) return None @@ -266,19 +273,24 @@ def extract_tool_calls_streaming( self.stream_tool_call_portion_regex.match( tool_call_portion)) if current_tool_call_matches: - function_name = current_tool_call_matches.group("function_name") - function_args_xml = current_tool_call_matches.group("function_arguments") or "" + function_name = current_tool_call_matches.group( + "function_name") + function_args_xml = current_tool_call_matches.group( + "function_arguments") or "" current_tool_call['id'] = f"call_{self.current_tool_id}" current_tool_call["name"] = function_name.strip() - current_tool_call["arguments"] = self._parse_arguments(function_args_xml) + current_tool_call["arguments"] = self._parse_arguments( + function_args_xml) else: current_tool_call_name_matches = ( self.stream_tool_call_name_regex.match( tool_call_portion)) if current_tool_call_name_matches: - function_name = current_tool_call_name_matches.group("function_name") - current_tool_call['id'] = f"call_{self.current_tool_id}" + function_name = current_tool_call_name_matches.group( + "function_name") + current_tool_call[ + 'id'] = f"call_{self.current_tool_id}" current_tool_call["name"] = function_name.strip() current_tool_call["arguments"] = "{}" else: @@ -301,7 +313,7 @@ def extract_tool_calls_streaming( id=tool_id, function=DeltaFunctionCall( name=function_name).model_dump( - exclude_none=True), + exclude_none=True), ) ]) else: @@ -356,7 +368,7 @@ def extract_tool_calls_streaming( index=self.current_tool_id, function=DeltaFunctionCall( arguments=cur_arguments).model_dump( - exclude_none=True), + exclude_none=True), ) ]) self.streamed_args_for_tool[ @@ -376,7 +388,7 @@ def extract_tool_calls_streaming( index=self.current_tool_id, function=DeltaFunctionCall( arguments=delta_arguments).model_dump( - exclude_none=True), + exclude_none=True), ) ]) self.streamed_args_for_tool[ @@ -396,4 +408,4 @@ def extract_tool_calls_streaming( except Exception: logger.exception("Error trying to handle streaming tool call.") - return None # do not stream a delta. skip this token ID. \ No newline at end of file + return None # do not stream a delta. skip this token ID. 
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 59cbb8430e2c..bae593c1dff0 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -3,10 +3,10 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser from .granite_reasoning_parser import GraniteReasoningParser from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser from .qwen3_reasoning_parser import Qwen3ReasoningParser -from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser __all__ = [ "ReasoningParser", From cd7ca6226035fa39b00c806cfd0b02d8acc278f5 Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Sat, 19 Jul 2025 00:30:48 +0800 Subject: [PATCH 38/44] Update glm4_moe_tool_parser.py --- .../tool_parsers/glm4_moe_tool_parser.py | 102 ++++++++---------- 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py index 504fccfe411c..371d8d9576bb 100644 --- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py @@ -157,17 +157,15 @@ def extract_tool_calls_streaming( logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) - - # check to see if we should be streaming a tool call - if self.tool_call_start_token_id not in current_token_ids: + # check to see if we should be streaming a tool call - is there a + if self.tool_calls_start_token_id not in current_token_ids: logger.debug("No tool call tokens found!") return DeltaMessage(content=delta_text) - - # Remove tool call tokens from delta text for processing - delta_text = delta_text.replace(self.tool_call_start_token, - "").replace(self.tool_call_end_token, + delta_text = delta_text.replace(self.tool_calls_start_token, + "").replace(self.tool_calls_end_token, "") try: + # figure out where we are in the parsing by counting tool call # start & end tags prev_tool_start_count = previous_token_ids.count( @@ -178,7 +176,6 @@ def extract_tool_calls_streaming( self.tool_call_start_token_id) cur_tool_end_count = current_token_ids.count( self.tool_call_end_token_id) - tool_call_portion = None text_portion = None @@ -189,17 +186,16 @@ def extract_tool_calls_streaming( logger.debug("Generating text content! 
skipping tool parsing.") return DeltaMessage(content=delta_text) - # Handle tool call end if self.tool_call_end_token in delta_text: logger.debug("tool_call_end_token in delta_text") full_text = current_text + delta_text tool_call_portion = full_text.split( self.tool_call_start_token)[-1].split( - self.tool_call_end_token)[0].strip() + self.tool_call_end_token)[0].rstrip() delta_text = delta_text.split( - self.tool_call_end_token)[0].strip() + self.tool_call_end_token)[0].rstrip() text_portion = delta_text.split( - self.tool_call_end_token)[-1].strip() + self.tool_call_end_token)[-1].lstrip() # case -- we're starting a new tool call if (cur_tool_start_count > cur_tool_end_count @@ -209,6 +205,7 @@ def extract_tool_calls_streaming( self.tool_call_start_token)[-1] else: tool_call_portion = None + delta = None text_portion = None @@ -235,30 +232,28 @@ def extract_tool_calls_streaming( logger.debug( "attempting to close tool call, but no tool call") return None - - # Handle any remaining arguments - if self.current_tool_id < len(self.prev_tool_call_arr): - current_tool_call = self.prev_tool_call_arr[ - self.current_tool_id] - if current_tool_call.get("arguments"): - remaining_args = current_tool_call["arguments"] - if remaining_args not in self.streamed_args_for_tool[ - self.current_tool_id]: - diff = remaining_args[len( - self.streamed_args_for_tool[self. - current_tool_id]):] - if diff: - self.streamed_args_for_tool[ - self.current_tool_id] += diff - return DeltaMessage(tool_calls=[ - DeltaToolCall( - index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=diff).model_dump( - exclude_none=True), - ) - ]) - return None + diff = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + if diff: + diff = (diff.encode("utf-8").decode("unicode_escape") + if diff is str else diff) + if '"}' not in delta_text: + return None + end_loc = delta_text.rindex('"}') + diff = delta_text[:end_loc] + '"}' + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", + diff, + ) + self.streamed_args_for_tool[self.current_tool_id] += diff + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump(exclude_none=True), + ) + ]) # case -- otherwise we're just generating text else: @@ -273,28 +268,23 @@ def extract_tool_calls_streaming( self.stream_tool_call_portion_regex.match( tool_call_portion)) if current_tool_call_matches: - function_name = current_tool_call_matches.group( - "function_name") - function_args_xml = current_tool_call_matches.group( - "function_arguments") or "" - - current_tool_call['id'] = f"call_{self.current_tool_id}" - current_tool_call["name"] = function_name.strip() - current_tool_call["arguments"] = self._parse_arguments( - function_args_xml) + tool_id, tool_args = (current_tool_call_matches.groups()) + tool_name = tool_id.split('.')[1].split(':')[0] + current_tool_call['id'] = tool_id + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = tool_args else: current_tool_call_name_matches = ( self.stream_tool_call_name_regex.match( tool_call_portion)) if current_tool_call_name_matches: - function_name = current_tool_call_name_matches.group( - "function_name") - current_tool_call[ - 'id'] = f"call_{self.current_tool_id}" - current_tool_call["name"] = function_name.strip() - current_tool_call["arguments"] = "{}" + tool_id_str, = current_tool_call_name_matches.groups() + tool_name = tool_id_str.split('.')[1].split(':')[0] + 
current_tool_call['id'] = tool_id_str
+                        current_tool_call["name"] = tool_name
+                        current_tool_call["arguments"] = ""
                 else:
-                    logger.debug("Not enough tokens to parse tool call")
+                    logger.debug("Not enough token")
                     return None
 
             # case - we haven't sent the tool name yet. If it's available, send
@@ -354,15 +344,17 @@ def extract_tool_calls_streaming(
                 logger.debug("Skipping text %s - no arguments", delta_text)
                 delta = None
 
-            # case -- prev arguments are defined, but none are now.
+            # case -- prev arguments are defined, but non are now.
             # probably impossible, but not a fatal error - just keep going
             elif not cur_arguments and prev_arguments:
                 logger.error("should be impossible to have arguments reset "
                              "mid-call. skipping streaming anything.")
                 delta = None
 
-            # case -- we now have the first info about arguments available
+            # case -- we now have the first info about arguments available from
+            # autocompleting the JSON
             elif cur_arguments and not prev_arguments:
+
                 delta = DeltaMessage(tool_calls=[
                     DeltaToolCall(
                         index=self.current_tool_id,
@@ -381,7 +373,7 @@ def extract_tool_calls_streaming(
                     and len(cur_arguments) > len(prev_arguments)
                     and cur_arguments.startswith(prev_arguments)):
                 delta_arguments = cur_arguments[len(prev_arguments):]
-                logger.debug("got diff %s", delta_arguments)
+                logger.debug("got diff %s", delta_text)
 
                 delta = DeltaMessage(tool_calls=[
                     DeltaToolCall(

From 7b369704e81123cc53e16335974cf64e518f145b Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Sat, 19 Jul 2025 00:44:46 +0800
Subject: [PATCH 39/44] 2

---
 .../openai/tool_parsers/glm4_moe_tool_parser.py |  5 ++---
 vllm/reasoning/glm4_moe_reasoning_parser.py     | 14 +++++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
index 371d8d9576bb..c3f9d7923575 100644
--- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -32,7 +32,6 @@ def __init__(self, tokenizer: AnyTokenizer):
 
         self.tool_call_start_token = "<tool_call>"
         self.tool_call_end_token = "</tool_call>"
-        # This is the key fix - we need to set the tool_calls_start_token used by the base class
         self.tool_calls_start_token = self.tool_call_start_token
 
         # Updated regex for the XML-based format
@@ -158,11 +157,11 @@ def extract_tool_calls_streaming(
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
         # check to see if we should be streaming a tool call - is there a
-        if self.tool_calls_start_token_id not in current_token_ids:
+        if self.tool_call_start_token_id not in current_token_ids:
             logger.debug("No tool call tokens found!")
             return DeltaMessage(content=delta_text)
         delta_text = delta_text.replace(self.tool_calls_start_token,
-                                        "").replace(self.tool_calls_end_token,
+                                        "").replace(self.tool_call_end_token,
                                         "")
 
         try:
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
index bc9e8b7abf82..6511fb49d10e 100644
--- a/vllm/reasoning/glm4_moe_reasoning_parser.py
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -19,11 +19,11 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
     """
     Reasoning parser for the Glm4MoeModel model.
 
-    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning text
-    within its output. The model provides a strict switch to disable reasoning
-    output via the 'enable_thinking=False' parameter. 
This parser extracts the
-    reasoning content enclosed by <think> and </think> tokens from the model's
-    output.
+    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning
+    text within its output. The model provides a strict switch to disable
+    reasoning output via the 'enable_thinking=False' parameter. This parser
+    extracts the reasoning content enclosed by <think> and </think> tokens
+    from the model's output.
     """
 
     def __init__(self, tokenizer: PreTrainedTokenizerBase):
@@ -41,8 +41,8 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase):
         if (self.think_start_token_id is None
                 or self.think_end_token_id is None):
             raise RuntimeError(
-                "Glm4MoeModel reasoning parser could not locate think start/end "
-                "tokens in the tokenizer!")
+                "Glm4MoeModel reasoning parser could not locate "
+                "think start/end tokens in the tokenizer!")
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids

From 554e4675e9899157f2c804581cad3d46887fb927 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Sat, 19 Jul 2025 13:49:57 +0800
Subject: [PATCH 40/44] update

---
 tests/models/registry.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 72d5eb8932a7..6152244a46d4 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -363,7 +363,9 @@ def check_available_online(
                                         trust_remote_code=True,
                                         hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
     "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"),  # noqa: E501
-    "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5", min_transformers_version="4.54"),  # noqa: E501
+    "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5",
+                                          min_transformers_version="4.54",
+                                          is_available_online=False),  # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
                                     extras={"2b": "h2oai/h2ovl-mississippi-2b"},  # noqa: E501
                                     max_transformers_version="4.48",  # noqa: E501
@@ -486,7 +488,8 @@ def check_available_online(
                                         speculative_model="openbmb/MiniCPM-2B-sft-bf16",
                                         tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
     "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4.5",
-                                        speculative_model="THUDM/GLM-4.5"),
+                                        speculative_model="THUDM/GLM-4.5",
+                                        is_available_online=False),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                       trust_remote_code=True,
                                       speculative_model="XiaomiMiMo/MiMo-7B-RL")

From c82943525b9e3d9367273f1059c1debda55e0add Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Sat, 19 Jul 2025 13:55:20 +0800
Subject: [PATCH 41/44] Update registry.py

---
 tests/models/registry.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 6152244a46d4..d86fcef723e6 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -489,6 +489,7 @@ def check_available_online(
                                        tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
     "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4.5",
                                         speculative_model="THUDM/GLM-4.5",
+                                        min_transformers_version="4.54",
                                         is_available_online=False),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                       trust_remote_code=True,
                                       speculative_model="XiaomiMiMo/MiMo-7B-RL")

From 1065a1376332297e9293c967ddb7d5135ae8ce82 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Sat, 19 Jul 2025 17:24:52 +0800
Subject: [PATCH 42/44] add glm_4_moe

---
 vllm/config.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 68ec2427f639..5908b816a18f 100644
--- 
a/vllm/config.py +++ b/vllm/config.py @@ -2694,7 +2694,10 @@ def __post_init__(self): (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or self.target_model_config.hf_text_config.model_type \ - == "mimo"): + == "mimo" or + self.target_model_config.hf_text_config.model_type \ + == "glm4_moe" + ): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): From 3973a74a1cc261ea3453ae0a37e391d42825ee1e Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Sat, 19 Jul 2025 21:33:03 +0800 Subject: [PATCH 43/44] 1 --- vllm/config.py | 96 +++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 59 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 84bb661ebd9b..96f27ced3605 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2009,6 +2009,19 @@ def has_unfinished_dp(dp_group: "ProcessGroup", aggregated_has_unfinished = bool(tensor.item()) return aggregated_has_unfinished + @staticmethod + def sync_kv_cache_memory_size(dp_group: "ProcessGroup", + kv_cache_memory: int) -> int: + if kv_cache_memory == -1: + kv_cache_memory = torch.iinfo(torch.int64).max + tensor = torch.tensor([kv_cache_memory], + dtype=torch.int64, + device="cpu") + # we cannot use broadcast for stateless dp group since it depends + # on global rank + torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) + return tensor.item() + def compute_hash(self): """ Provide a hash that uniquely identifies all the configs @@ -2524,8 +2537,6 @@ def __post_init__(self): SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp"] -SpeculativeAcceptanceMethod = Literal["rejection_sampler", - "typical_acceptance_sampler"] @config @@ -2548,13 +2559,6 @@ class SpeculativeConfig: If using `ngram` method, the related configuration `prompt_lookup_max` and `prompt_lookup_min` should be considered.""" - acceptance_method: SpeculativeAcceptanceMethod = "rejection_sampler" - """The method to use for accepting draft tokens:\n - - "rejection_sampler" maps to `RejectionSampler`.\n - - "typical_acceptance_sampler" maps to `TypicalAcceptanceSampler`. - - If using `typical_acceptance_sampler`, the related configuration - `posterior_threshold` and `posterior_alpha` should be considered.""" draft_tensor_parallel_size: Optional[int] = None """The degree of the tensor parallelism for the draft model. Can only be 1 or the same as the target model's tensor parallel size.""" @@ -2581,9 +2585,6 @@ class SpeculativeConfig: will use the default version.""" # Advanced control - disable_mqa_scorer: bool = False - """Disable the MQA scorer and fall back to batch expansion for scoring - proposals.""" disable_by_batch_size: Optional[int] = None """Disable speculative decoding for new incoming requests when the number of enqueued requests is larger than this value, if provided.""" @@ -2596,16 +2597,6 @@ class SpeculativeConfig: """Minimum size of ngram token window when using Ngram proposer, if provided. Defaults to 1.""" - # Typical acceptance sampler configuration - posterior_threshold: Optional[float] = None - """A threshold value that sets a lower bound on the posterior probability - of a token in the target model for it to be accepted. This threshold is - used only when we use the `TypicalAcceptanceSampler` for token acceptance. 
- """ - posterior_alpha: Optional[float] = None - """Scaling factor for entropy-based threshold, applied when using - `TypicalAcceptanceSampler`.""" - speculative_token_tree: Optional[str] = None """Specifies the tree structure for speculative token generation. """ @@ -2673,7 +2664,15 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "n_predict": n_predict, "architectures": ["MiMoMTPModel"] }) - return hf_config + + if hf_config.architectures[0] == "Glm4MoeForCausalLM": + hf_config.model_type = "glm4_moe_mtp" + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "num_hidden_layers": 0, + "n_predict": n_predict, + "architectures": ["Glm4MoeMTP"] + }) return hf_config @@ -2694,10 +2693,7 @@ def __post_init__(self): (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or self.target_model_config.hf_text_config.model_type \ - == "mimo" or - self.target_model_config.hf_text_config.model_type \ - == "glm4_moe" - ): + == "mimo"): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): @@ -2786,8 +2782,8 @@ def __post_init__(self): elif (self.draft_model_config.hf_config.model_type == "mlp_speculator"): self.method = "mlp_speculator" - elif (self.draft_model_config.hf_config.model_type == - "deepseek_mtp"): + elif (self.draft_model_config.hf_config.model_type + in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")): self.method = "deepseek_mtp" if self.num_speculative_tokens > 1: logger.warning( @@ -2797,6 +2793,11 @@ def __post_init__(self): ) else: self.method = "draft_model" + raise NotImplementedError( + "Speculative decoding with draft model is not " + "supported yet. Please consider using other " + "speculative decoding methods such as ngram, medusa, " + "eagle, or deepseek_mtp.") # Replace hf_config for EAGLE draft_model if self.method in ("eagle", "eagle3"): @@ -2855,12 +2856,6 @@ def __post_init__(self): self.target_parallel_config, self.draft_tensor_parallel_size)) - if self.acceptance_method == "typical_acceptance_sampler": - if self.posterior_threshold is None: - self.posterior_threshold = 0.09 - if self.posterior_alpha is None: - self.posterior_alpha = 0.3 - @staticmethod def _maybe_override_draft_max_model_len( speculative_max_model_len: Optional[int], @@ -2966,30 +2961,6 @@ def _verify_args(self) -> Self: if self.draft_model_config: self.draft_model_config.verify_with_parallel_config( self.draft_parallel_config) - # Validate and set draft token acceptance related settings. - - if self.acceptance_method is None: - raise ValueError("acceptance_method is not set. " - "Expected values are rejection_sampler or " - "typical_acceptance_sampler.") - - if (self.acceptance_method != 'rejection_sampler' - and self.acceptance_method != 'typical_acceptance_sampler'): - raise ValueError( - "Expected acceptance_method to be either " - "rejection_sampler or typical_acceptance_sampler. Instead it " - f"is {self.acceptance_method}") - - if self.acceptance_method == "typical_acceptance_sampler" and ( - (self.posterior_threshold is not None - and self.posterior_threshold < 0) or - (self.posterior_alpha is not None and self.posterior_alpha < 0)): - raise ValueError( - "Expected the posterior_threshold and posterior_alpha of " - "typical_acceptance_sampler to be > 0. 
" - "Instead found posterior_threshold = " - f"{self.posterior_threshold} and posterior_alpha = " - f"{self.posterior_alpha}") if (self.disable_by_batch_size is not None and self.disable_by_batch_size < 2): @@ -4701,6 +4672,13 @@ def __post_init__(self): if self.kv_events_config is not None: # Hybrid KV cache manager is not compatible with KV events. self.scheduler_config.disable_hybrid_kv_cache_manager = True + if self.model_config is not None and \ + self.model_config.attention_chunk_size is not None and \ + self.speculative_config is not None and \ + self.speculative_config.use_eagle(): + # Hybrid KV cache manager is not yet supported with chunked + # local attention + eagle. + self.scheduler_config.disable_hybrid_kv_cache_manager = True def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: From 6618ce6aebaaec0c5ea7f7b6e414e9a560a9009f Mon Sep 17 00:00:00 2001 From: zRzRzRzRzRzRzR <2448370773@qq.com> Date: Sat, 19 Jul 2025 21:47:18 +0800 Subject: [PATCH 44/44] Update config.py --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 96f27ced3605..e92c501012a4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2671,7 +2671,7 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update({ "num_hidden_layers": 0, "n_predict": n_predict, - "architectures": ["Glm4MoeMTP"] + "architectures": ["Glm4MoeMTPModel"] }) return hf_config