diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 4e5abea8e744..ac02e9bded49 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -738,6 +738,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ |
+| `Qwen3_5ForConditionalGeneration` | Qwen3.5 | T + IE+ + VE+ | `Qwen/Qwen3.5-9B-Instruct`, etc. | ✅︎ | ✅︎ |
+| `Qwen3_5MoeForConditionalGeneration` | Qwen3.5-MOE | T + IE+ + VE+ | `Qwen/Qwen3.5-35B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ |
diff --git a/tests/models/registry.py b/tests/models/registry.py
index f688985ce404..d2c67cf7e709 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -970,6 +970,26 @@ def check_available_online(
max_model_len=4096,
min_transformers_version="4.57",
),
+ "Qwen3_5ForConditionalGeneration": _HfExamplesInfo(
+ "Qwen/Qwen3.5-9B-Instruct",
+ max_model_len=4096,
+ min_transformers_version="5.1.0",
+ ),
+ "Qwen3_5MoeForConditionalGeneration": _HfExamplesInfo(
+ "Qwen/Qwen3.5-35B-A3B-Instruct",
+ max_model_len=4096,
+ min_transformers_version="5.1.0",
+ ),
+ "Qwen3_5MTP": _HfExamplesInfo(
+ "Qwen/Qwen3.5-9B-Instruct",
+ speculative_model="Qwen/Qwen3.5-9B-Instruct",
+ min_transformers_version="5.1.0",
+ ),
+ "Qwen3_5MoeMTP": _HfExamplesInfo(
+ "Qwen/Qwen3.5-35B-A3B-Instruct",
+ speculative_model="Qwen/Qwen3.5-35B-A3B-Instruct",
+ min_transformers_version="5.1.0",
+ ),
"Qwen3OmniMoeForConditionalGeneration": _HfExamplesInfo(
"Qwen/Qwen3-Omni-30B-A3B-Instruct",
max_model_len=4096,
diff --git a/vllm/config/model.py b/vllm/config/model.py
index a359df37484e..b76d5186835b 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1218,8 +1218,8 @@ def get_num_layers_by_block_type(
if attn_type_list:
return sum(t == 1 for t in attn_type_list[start:end])
- # Hybrid model Qwen3Next
- layer_types_value = getattr(self.hf_config, "layer_types", None)
+ # Hybrid models: Qwen3Next and the Qwen3.5 series
+ layer_types_value = getattr(self.hf_text_config, "layer_types", None)
if layer_types_value is not None:
if block_type == "attention":
return sum(
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 8a54dbb6dbb1..8117349d84b6 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -37,6 +37,7 @@
"ernie_mtp",
"exaone_moe_mtp",
"qwen3_next_mtp",
+ "qwen3_5_mtp",
"longcat_flash_mtp",
"mtp",
"pangu_ultra_moe_mtp",
@@ -263,6 +264,16 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
{"n_predict": n_predict, "architectures": ["ExaoneMoeMTP"]}
)
+ if hf_config.model_type in ("qwen3_5", "qwen3_5_moe"):
+ is_moe = hf_config.model_type == "qwen3_5_moe"
+ hf_config.model_type = "qwen3_5_mtp"
+ n_predict = getattr(hf_config, "mtp_num_hidden_layers", None)
+ hf_config.update(
+ {
+ "n_predict": n_predict,
+ "architectures": ["Qwen3_5MoeMTP" if is_moe else "Qwen3_5MTP"],
+ }
+ )
if hf_config.model_type == "longcat_flash":
hf_config.model_type = "longcat_flash_mtp"
n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index f92ecb6b5b4e..347ce139e906 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -43,7 +43,8 @@ def get_state_dtype(self) -> tuple[torch.dtype, ...]:
def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None:
if (
vllm_config.speculative_config is not None
- and vllm_config.model_config.hf_config.model_type not in ["qwen3_next"]
+ and vllm_config.model_config.hf_config.model_type
+ not in ["qwen3_next", "qwen3_5", "qwen3_5_moe"]
):
raise NotImplementedError(
"Mamba with speculative decoding is not supported yet."
diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py
new file mode 100644
index 000000000000..d6df7523bda2
--- /dev/null
+++ b/vllm/model_executor/models/qwen3_5.py
@@ -0,0 +1,993 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The vLLM team.
+# Copyright 2025 The Qwen Team.
+# Copyright 2025 The HuggingFace Inc. team.
+# All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen3.5 Series compatible with HuggingFace weights."""
+
+import typing
+from collections.abc import Callable, Iterable
+
+import torch
+from einops import rearrange
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.models.qwen3_5.configuration_qwen3_5 import (
+ Qwen3_5Config,
+ Qwen3_5TextConfig,
+)
+from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import (
+ Qwen3_5MoeConfig,
+ Qwen3_5MoeTextConfig,
+)
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (
+ CacheConfig,
+ ModelConfig,
+ SpeculativeConfig,
+ VllmConfig,
+ get_current_vllm_config,
+)
+from vllm.distributed import (
+ divide,
+ get_pp_group,
+ get_tensor_model_parallel_rank,
+ get_tensor_model_parallel_world_size,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import (
+ GemmaRMSNorm as Qwen3_5RMSNorm,
+)
+from vllm.model_executor.layers.layernorm import RMSNormGated
+from vllm.model_executor.layers.linear import (
+ ColumnParallelLinear,
+ MergedColumnParallelLinear,
+ RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba_mixer2 import (
+ mamba_v2_sharded_weight_loader,
+)
+from vllm.model_executor.layers.mamba.mamba_utils import (
+ MambaStateCopyFunc,
+ MambaStateCopyFuncCalculator,
+ MambaStateDtypeCalculator,
+ MambaStateShapeCalculator,
+)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+ ParallelLMHead,
+ VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+ default_weight_loader,
+ sharded_weight_loader,
+)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (
+ HasInnerState,
+ IsHybrid,
+ MixtureOfExperts,
+ MultiModalEmbeddings,
+ SupportsLoRA,
+ SupportsPP,
+ _require_is_multimodal,
+)
+from .qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP
+from .qwen3_next import (
+ Qwen3NextAttention,
+ Qwen3NextDecoderLayer,
+ Qwen3NextGatedDeltaNet,
+ Qwen3NextModel,
+ Qwen3NextSparseMoeBlock,
+ QwenNextMixtureOfExperts,
+)
+from .qwen3_vl import (
+ Qwen3_VisionTransformer,
+ Qwen3VLDummyInputsBuilder,
+ Qwen3VLForConditionalGeneration,
+ Qwen3VLMultiModalProcessor,
+ Qwen3VLProcessingInfo,
+)
+from .utils import (
+ AutoWeightsLoader,
+ PPMissingLayer,
+ _merge_multimodal_embeddings,
+ extract_layer_index,
+ is_pp_missing_parameter,
+ make_empty_intermediate_tensors_factory,
+ make_layers,
+ maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class Qwen3_5ProcessingInfo(Qwen3VLProcessingInfo):
+ def get_hf_config(self):
+ return self.ctx.get_hf_config(Qwen3_5Config)
+
+
+class Qwen3_5MoeProcessingInfo(Qwen3VLProcessingInfo):
+ def get_hf_config(self):
+ return self.ctx.get_hf_config(Qwen3_5MoeConfig)
+
+
+class Qwen3_5GatedDeltaNet(Qwen3NextGatedDeltaNet):
+ def __init__(
+ self,
+ config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig,
+ model_config: ModelConfig | None = None,
+ cache_config: CacheConfig | None = None,
+ quant_config: QuantizationConfig | None = None,
+ speculative_config: SpeculativeConfig | None = None,
+ prefix: str = "",
+ ) -> None:
+ super(Qwen3NextGatedDeltaNet, self).__init__()
+ self.tp_size = get_tensor_model_parallel_world_size()
+ self.tp_rank = get_tensor_model_parallel_rank()
+ self.hidden_size = config.hidden_size
+ self.num_v_heads = config.linear_num_value_heads
+ self.num_k_heads = config.linear_num_key_heads
+ self.head_k_dim = config.linear_key_head_dim
+ self.head_v_dim = config.linear_value_head_dim
+ self.key_dim = self.head_k_dim * self.num_k_heads
+ self.value_dim = self.head_v_dim * self.num_v_heads
+
+ self.conv_kernel_size = config.linear_conv_kernel_dim
+ self.layer_idx = extract_layer_index(prefix)
+ self.activation = config.hidden_act
+ self.act = ACT2FN[config.hidden_act]
+ self.layer_norm_epsilon = config.rms_norm_eps
+ self.prefix = prefix
+
+ self.config = config
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.quant_config = quant_config
+ self.speculative_config = speculative_config
+ self.num_spec = (
+ self.speculative_config.num_speculative_tokens
+ if self.speculative_config
+ else 0
+ )
+
+ # QKV
+ self.conv_dim = self.key_dim * 2 + self.value_dim
+ self.conv1d = ColumnParallelLinear(
+ input_size=self.conv_kernel_size,
+ output_size=self.conv_dim,
+ bias=False,
+ prefix=f"{prefix}.conv1d",
+ )
+ self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+
+ self.in_proj_qkv = MergedColumnParallelLinear(
+ input_size=self.hidden_size,
+ output_sizes=[self.key_dim, self.key_dim, self.value_dim],
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_qkv",
+ )
+ self.in_proj_z = ColumnParallelLinear(
+ input_size=self.hidden_size,
+ output_size=self.value_dim,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_z",
+ )
+ self.in_proj_b = ColumnParallelLinear(
+ input_size=self.hidden_size,
+ output_size=self.num_v_heads,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_ba",
+ )
+ self.in_proj_a = ColumnParallelLinear(
+ input_size=self.hidden_size,
+ output_size=self.num_v_heads,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_a",
+ )
+
+ query_key_settings = (self.key_dim, 0, False)
+ value_settings = (self.value_dim, 0, False)
+
+ delattr(self.conv1d.weight, "weight_loader")
+ set_weight_attrs(
+ self.conv1d.weight,
+ {
+ "weight_loader": mamba_v2_sharded_weight_loader(
+ [
+ query_key_settings,
+ query_key_settings,
+ value_settings,
+ ],
+ self.tp_size,
+ self.tp_rank,
+ )
+ },
+ )
+
+ # Selective projection used to make dt, B and C input-dependent
+
+ # time step projection (discretization)
+ # instantiate once and copy inv_dt in init_weights of PretrainedModel
+ self.dt_bias = nn.Parameter(
+ torch.ones(self.num_v_heads // self.tp_size),
+ )
+ self.A_log = nn.Parameter(
+ torch.empty(
+ divide(self.num_v_heads, self.tp_size),
+ )
+ )
+
+ set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(0)})
+ set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+
+ self.norm = RMSNormGated(
+ self.head_v_dim,
+ eps=self.layer_norm_epsilon,
+ group_size=None,
+ norm_before_gate=True,
+ device=current_platform.current_device(),
+ dtype=config.dtype,
+ )
+
+ self.out_proj = RowParallelLinear(
+ self.value_dim,
+ self.hidden_size,
+ bias=False,
+ input_is_parallel=True,
+ quant_config=quant_config,
+ prefix=f"{prefix}.out_proj",
+ )
+
+ compilation_config = get_current_vllm_config().compilation_config
+ if prefix in compilation_config.static_forward_context:
+ raise ValueError(f"Duplicate layer name: {prefix}")
+ compilation_config.static_forward_context[prefix] = self
+
+ def fix_query_key_value_ordering(
+ self,
+ mixed_qkv,
+ z,
+ b,
+ a,
+ ):
+ raise NotImplementedError(
+ "Qwen3.5 Series dont need to fix query key value ordering"
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output: torch.Tensor,
+ ):
+ """
+ Forward pass with three parts:
+ 1. Input projection
+ 2. Core attention (custom op)
+ 3. Output projection
+ """
+ num_tokens = hidden_states.size(0)
+
+ # ============================================================
+ # Part 1: Input Projection
+ # ============================================================
+ mixed_qkv, _ = self.in_proj_qkv(hidden_states)
+ z, _ = self.in_proj_z(hidden_states)
+ z = z.reshape(z.size(0), -1, self.head_v_dim)
+ b, _ = self.in_proj_b(hidden_states)
+ a, _ = self.in_proj_a(hidden_states)
+
+ b = b.contiguous()
+ a = a.contiguous()
+
+ # ============================================================
+ # Part 2: Core Attention (Custom Op)
+ # ============================================================
+ # Note: we should not use torch.empty here like other attention backends,
+ # see discussions in https://github.com/vllm-project/vllm/pull/28182
+ core_attn_out = torch.zeros(
+ (num_tokens, self.num_v_heads // self.tp_size, self.head_v_dim),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device,
+ )
+
+ torch.ops.vllm.gdn_attention_core(
+ mixed_qkv,
+ b,
+ a,
+ core_attn_out,
+ self.prefix,
+ )
+
+ # ============================================================
+ # Part 3: Output Projection
+ # ============================================================
+ z_shape_og = z.shape
+ # Reshape input data into 2D tensor
+ core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+ z = z.reshape(-1, z.shape[-1])
+ core_attn_out = self.norm(core_attn_out, z)
+ core_attn_out = core_attn_out.reshape(z_shape_og)
+ core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
+ output[:num_tokens], _ = self.out_proj(core_attn_out)
+
+
+class Qwen3_5DecoderLayer(Qwen3NextDecoderLayer):
+ def __init__(
+ self,
+ vllm_config: VllmConfig,
+ layer_type: str,
+ prefix: str = "",
+ ) -> None:
+ super(Qwen3NextDecoderLayer, self).__init__()
+
+ config = vllm_config.model_config.hf_text_config
+ model_config = vllm_config.model_config
+ cache_config = vllm_config.cache_config
+ quant_config = vllm_config.quant_config
+ speculative_config = vllm_config.speculative_config
+
+ self.layer_type = layer_type
+ self.layer_idx = extract_layer_index(prefix)
+
+ if self.layer_type == "linear_attention":
+ self.linear_attn = Qwen3_5GatedDeltaNet(
+ config,
+ model_config=model_config,
+ cache_config=cache_config,
+ quant_config=quant_config,
+ speculative_config=speculative_config,
+ prefix=f"{prefix}.linear_attn",
+ )
+ elif self.layer_type == "full_attention":
+ self.self_attn = Qwen3NextAttention(
+ config,
+ model_config=model_config,
+ cache_config=cache_config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.self_attn",
+ )
+ else:
+ raise ValueError(f"Invalid layer_type {self.layer_type}")
+
+ # NOTE: Determine the MLP type based on the model type
+ # Qwen3.5 uses a dense MLP in every layer; Qwen3.5-MoE uses sparse MoE blocks
+ if config.model_type == "qwen3_5_moe_text":
+ self.mlp = Qwen3NextSparseMoeBlock(
+ vllm_config=vllm_config,
+ prefix=f"{prefix}.mlp",
+ )
+ elif config.model_type == "qwen3_5_text":
+ self.mlp = Qwen3NextMLP(
+ hidden_size=config.hidden_size,
+ intermediate_size=config.intermediate_size,
+ hidden_act=config.hidden_act,
+ quant_config=quant_config,
+ prefix=f"{prefix}.mlp",
+ )
+ else:
+ raise ValueError(f"Invalid model_type {config.model_type}")
+
+ self.input_layernorm = Qwen3_5RMSNorm(
+ config.hidden_size, eps=config.rms_norm_eps
+ )
+ self.post_attention_layernorm = Qwen3_5RMSNorm(
+ config.hidden_size, eps=config.rms_norm_eps
+ )
+
+ self.layer_scale = getattr(config, "layer_scale", False)
+ if self.layer_scale:
+ self.attn_layer_scale = torch.nn.Parameter(
+ torch.zeros(
+ 1,
+ 1,
+ config.hidden_size,
+ dtype=config.dtype,
+ ),
+ )
+ self.ffn_layer_scale = torch.nn.Parameter(
+ torch.zeros(
+ 1,
+ 1,
+ config.hidden_size,
+ dtype=config.dtype,
+ ),
+ )
+
+
+@support_torch_compile(
+ dynamic_arg_dims={
+ "input_ids": 0,
+ # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
+ # otherwise (seq_len, ).
+ "positions": -1,
+ "intermediate_tensors": 0,
+ "inputs_embeds": 0,
+ }
+)
+class Qwen3_5Model(Qwen3NextModel):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ super(Qwen3NextModel, self).__init__()
+
+ config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = (
+ vllm_config.model_config.hf_text_config
+ )
+ parallel_config = vllm_config.parallel_config
+
+ eplb_config = parallel_config.eplb_config
+ self.num_redundant_experts = eplb_config.num_redundant_experts
+
+ self.config = config
+
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = VocabParallelEmbedding(
+ self.vocab_size,
+ config.hidden_size,
+ )
+
+ def get_layer(prefix: str):
+ return Qwen3_5DecoderLayer(
+ vllm_config,
+ layer_type=config.layer_types[extract_layer_index(prefix)],
+ prefix=prefix,
+ )
+
+ self.start_layer, self.end_layer, self.layers = make_layers(
+ config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+ )
+ self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+ ["hidden_states", "residual"], config.hidden_size
+ )
+
+ if get_pp_group().is_last_rank:
+ self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ else:
+ self.norm = PPMissingLayer()
+
+ def load_fused_expert_weights(
+ self,
+ name: str,
+ params_dict: dict,
+ loaded_weight: torch.Tensor,
+ shard_id: str,
+ num_experts: int,
+ ) -> bool:
+ param = params_dict[name]
+ weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
+ loaded_local_expert = False
+ for expert_id in range(num_experts):
+ curr_expert_weight = loaded_weight[expert_id]
+ success = weight_loader(
+ param,
+ curr_expert_weight,
+ name,
+ shard_id,
+ expert_id,
+ return_success=True,
+ )
+ if success:
+ loaded_local_expert = True
+
+ return loaded_local_expert
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ params_dict = dict(self.named_parameters())
+ loaded_params: set[str] = set()
+ expert_params_mapping = self.get_expert_mapping()
+ is_fused_expert = False
+ fused_expert_params_mapping = [
+ ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+ ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+ ]
+ num_experts = (
+ self.config.num_experts if hasattr(self.config, "num_experts") else 0
+ )
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+
+ if name.startswith("mtp."):
+ continue
+
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+ is_fused_expert = True
+ expert_params_mapping = fused_expert_params_mapping
+
+ if weight_name not in name:
+ continue
+
+ if "mlp.experts" in name:
+ continue
+
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ # Skip layers on other devices.
+ if is_pp_missing_parameter(name, self):
+ continue
+ # name = apply_attn_prefix(name, params_dict)
+ if name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ is_expert_weight = False
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in name:
+ continue
+ is_expert_weight = True
+ name_mapped = name.replace(weight_name, param_name)
+ # Skip layers on other devices.
+ if is_pp_missing_parameter(name_mapped, self):
+ continue
+ if is_fused_expert:
+ # Qwen3.5 checkpoints already store these weights in the expected layout, so no transpose is needed
+ # loaded_weight = loaded_weight.transpose(-1, -2)
+ if "experts.gate_up_proj" in name:
+ loaded_weight = loaded_weight.chunk(2, dim=-2)
+ success_w1 = self.load_fused_expert_weights(
+ name_mapped,
+ params_dict,
+ loaded_weight[0],
+ "w1",
+ num_experts,
+ )
+ success_w3 = self.load_fused_expert_weights(
+ name_mapped,
+ params_dict,
+ loaded_weight[1],
+ "w3",
+ num_experts,
+ )
+ success = success_w1 and success_w3
+ else:
+ # down_proj
+ success = self.load_fused_expert_weights(
+ name_mapped,
+ params_dict,
+ loaded_weight,
+ shard_id,
+ num_experts,
+ )
+ if success:
+ name = name_mapped
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if (
+ name_mapped.endswith(".bias")
+ or name_mapped.endswith("_bias")
+ ) and name_mapped not in params_dict:
+ continue
+ param = params_dict[name_mapped]
+ weight_loader = param.weight_loader
+ success = weight_loader(
+ param,
+ loaded_weight,
+ name_mapped,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ return_success=True,
+ )
+ if success:
+ name = name_mapped
+ break
+ else:
+ if is_expert_weight:
+ # We've checked that this is an expert weight
+ # However it's not mapped locally to this rank
+ # So we simply skip it
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ if is_pp_missing_parameter(name, self):
+ continue
+ if name not in params_dict:
+ logger.warning_once(
+ f"Parameter {name} not found in params_dict, skip loading"
+ )
+ continue
+ param = params_dict[name]
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ weight_loader(param, loaded_weight)
+ loaded_params.add(name)
+ return loaded_params
+
+
+class Qwen3_5ForCausalLMBase(
+ nn.Module,
+ HasInnerState,
+ SupportsLoRA,
+ SupportsPP,
+):
+ packed_modules_mapping = {
+ "qkv_proj": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ ],
+ "gate_up_proj": ["gate_proj", "up_proj"],
+ }
+
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ config = vllm_config.model_config.hf_text_config
+ self.vllm_config = vllm_config
+ self.model_config = vllm_config.model_config
+ cache_config = vllm_config.cache_config
+
+ scheduler_config = vllm_config.scheduler_config
+ if cache_config.mamba_cache_mode == "all":
+ raise NotImplementedError(
+ "Qwen3.5 currently does not support 'all' prefix caching, "
+ "please use '--mamba-cache-mode=align' instead"
+ )
+ self.quant_config = vllm_config.quant_config
+
+ super().__init__()
+ self.config = config
+ self.scheduler_config = scheduler_config
+ self.model = Qwen3_5Model(
+ vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+ )
+
+ if get_pp_group().is_last_rank:
+ if config.tie_word_embeddings:
+ self.lm_head = self.model.embed_tokens
+ else:
+ self.lm_head = ParallelLMHead(
+ config.vocab_size,
+ config.hidden_size,
+ prefix=maybe_prefix(prefix, "lm_head"),
+ )
+ else:
+ self.lm_head = PPMissingLayer()
+
+ self.logits_processor = LogitsProcessor(config.vocab_size)
+ self.make_empty_intermediate_tensors = (
+ self.model.make_empty_intermediate_tensors
+ )
+
+ def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+ return self.model.embed_input_ids(input_ids)
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ intermediate_tensors: IntermediateTensors | None = None,
+ inputs_embeds: torch.Tensor | None = None,
+ **kwargs: object,
+ ):
+ hidden_states = self.model(
+ input_ids, positions, intermediate_tensors, inputs_embeds
+ )
+
+ return hidden_states
+
+ def compute_logits(
+ self,
+ hidden_states: torch.Tensor,
+ ) -> torch.Tensor | None:
+ return self.logits_processor(self.lm_head, hidden_states)
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ loader = AutoWeightsLoader(
+ self,
+ skip_prefixes=["mtp."],
+ )
+ return loader.load_weights(weights)
+
+
+class Qwen3_5ForCausalLM(Qwen3_5ForCausalLMBase):
+ pass
+
+
+class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLMBase, QwenNextMixtureOfExperts):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+ # set MoE hyperparameters
+ self.set_moe_parameters()
+
+ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+ return self.model.get_expert_mapping()
+
+
+########################################################
+# Qwen3_5-Dense
+########################################################
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+ Qwen3VLMultiModalProcessor,
+ info=Qwen3_5ProcessingInfo,
+ dummy_inputs=Qwen3VLDummyInputsBuilder,
+)
+class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ # Protocols have no __init__ method, so call nn.Module.__init__ directly
+ nn.Module.__init__(self)
+ config: Qwen3_5Config = vllm_config.model_config.hf_config
+ quant_config = vllm_config.quant_config
+ multimodal_config = vllm_config.model_config.multimodal_config
+
+ self.config = config
+ self.multimodal_config = multimodal_config
+ self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
+ self.video_pruning_rate = multimodal_config.video_pruning_rate
+ self.is_multimodal_pruning_enabled = (
+ multimodal_config.is_multimodal_pruning_enabled()
+ )
+
+ with self._mark_tower_model(vllm_config, {"image", "video"}):
+ self.visual = Qwen3_VisionTransformer(
+ config.vision_config,
+ norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+ quant_config=quant_config,
+ prefix=maybe_prefix(prefix, "visual"),
+ )
+
+ with self._mark_language_model(vllm_config):
+ self.language_model = Qwen3_5ForCausalLM(
+ vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model")
+ )
+
+ self.make_empty_intermediate_tensors = (
+ self.language_model.make_empty_intermediate_tensors
+ )
+
+ def embed_input_ids(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: MultiModalEmbeddings | None = None,
+ *,
+ is_multimodal: torch.Tensor | None = None,
+ handle_oov_mm_token: bool = False,
+ ) -> torch.Tensor:
+ inputs_embeds = self._embed_text_input_ids(
+ input_ids,
+ self.language_model.embed_input_ids,
+ is_multimodal=is_multimodal,
+ handle_oov_mm_token=handle_oov_mm_token,
+ )
+
+ if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+ return inputs_embeds
+
+ is_multimodal = _require_is_multimodal(is_multimodal)
+
+ inputs_embeds = _merge_multimodal_embeddings(
+ inputs_embeds=inputs_embeds,
+ multimodal_embeddings=multimodal_embeddings,
+ is_multimodal=is_multimodal,
+ )
+
+ return inputs_embeds
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ intermediate_tensors: IntermediateTensors | None = None,
+ inputs_embeds: torch.Tensor | None = None,
+ **kwargs: object,
+ ) -> torch.Tensor | IntermediateTensors:
+ """Run forward pass for Qwen3.5.
+
+ Args:
+ input_ids: Flattened (concatenated) input_ids corresponding to a
+ batch.
+ positions: Flattened (concatenated) position ids corresponding to a
+ batch.
+ **NOTE**: If mrope is enabled (default setting for Qwen3VL
+ opensource models), the shape will be `(3, seq_len)`,
+ otherwise it will be `(seq_len,)`.
+ intermediate_tensors: Intermediate tensors from previous pipeline
+ stages.
+ inputs_embeds: Pre-computed input embeddings.
+ **kwargs: Additional keyword arguments including:
+ - pixel_values: Pixel values to be fed to a model.
+ `None` if no images are passed.
+ - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in
+ LLM. `None` if no images are passed.
+ - pixel_values_videos: Pixel values of videos to be fed to a
+ model. `None` if no videos are passed.
+ - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in
+ LLM. `None` if no videos are passed.
+ """
+
+ if intermediate_tensors is not None:
+ inputs_embeds = None
+
+ hidden_states = self.language_model.model(
+ input_ids=input_ids,
+ positions=positions,
+ intermediate_tensors=intermediate_tensors,
+ inputs_embeds=inputs_embeds,
+ )
+
+ return hidden_states
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ loader = AutoWeightsLoader(
+ self,
+ skip_prefixes=["mtp."],
+ )
+ return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
+
+ @classmethod
+ def get_mamba_state_dtype_from_config(
+ cls,
+ vllm_config: "VllmConfig",
+ ) -> tuple[torch.dtype, torch.dtype]:
+ return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
+ vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype
+ )
+
+ @classmethod
+ def get_mamba_state_shape_from_config(
+ cls, vllm_config: "VllmConfig"
+ ) -> tuple[tuple[int, int], tuple[int, int]]:
+ parallel_config = vllm_config.parallel_config
+ hf_config = vllm_config.model_config.hf_text_config
+ tp_size = parallel_config.tensor_parallel_size
+ num_spec = (
+ vllm_config.speculative_config.num_speculative_tokens
+ if vllm_config.speculative_config
+ else 0
+ )
+ return MambaStateShapeCalculator.gated_delta_net_state_shape(
+ tp_size,
+ hf_config.linear_num_key_heads,
+ hf_config.linear_num_value_heads,
+ hf_config.linear_key_head_dim,
+ hf_config.linear_value_head_dim,
+ hf_config.linear_conv_kernel_dim,
+ num_spec,
+ )
+
+ @classmethod
+ def get_mamba_state_copy_func(cls) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]:
+ return MambaStateCopyFuncCalculator.gated_delta_net_state_copy_func()
+
+
+########################################################
+# Qwen3_5-MoE
+########################################################
+
+
+class Qwen3_5_MoeMixtureOfExperts(MixtureOfExperts):
+ def update_physical_experts_metadata(
+ self,
+ num_physical_experts: int,
+ num_local_physical_experts: int,
+ ) -> None:
+ assert self.num_local_physical_experts == num_local_physical_experts
+ self.num_physical_experts = num_physical_experts
+ self.num_local_physical_experts = num_local_physical_experts
+ self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+ for layer in self.language_model.model.layers:
+ if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
+ moe = layer.mlp
+ moe.n_local_physical_experts = num_local_physical_experts
+ moe.n_physical_experts = num_physical_experts
+ moe.n_redundant_experts = self.num_redundant_experts
+ moe.experts.update_expert_map()
+
+ def set_moe_parameters(self):
+ self.expert_weights = []
+
+ self.moe_layers = []
+ example_moe = None
+ for layer in self.language_model.model.layers:
+ if isinstance(layer, Qwen3_5DecoderLayer) and isinstance(
+ layer.mlp, Qwen3NextSparseMoeBlock
+ ):
+ example_moe = layer.mlp
+ self.moe_layers.append(layer.mlp.experts)
+
+ if example_moe is None:
+ raise RuntimeError(
+ "No Qwen3_5 layer found in the language_model.model.layers."
+ )
+
+ # Set MoE hyperparameters
+ self.num_moe_layers = len(self.moe_layers)
+ self.num_expert_groups = 1
+ self.num_shared_experts = 0
+ self.num_logical_experts = example_moe.n_logical_experts
+ self.num_physical_experts = example_moe.n_physical_experts
+ self.num_local_physical_experts = example_moe.n_local_physical_experts
+ self.num_routed_experts = example_moe.n_routed_experts
+ self.num_redundant_experts = example_moe.n_redundant_experts
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+ Qwen3VLMultiModalProcessor,
+ info=Qwen3_5MoeProcessingInfo,
+ dummy_inputs=Qwen3VLDummyInputsBuilder,
+)
+class Qwen3_5MoeForConditionalGeneration(
+ Qwen3_5ForConditionalGeneration, Qwen3_5_MoeMixtureOfExperts
+):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ # Protocols have no __init__ method, so call nn.Module.__init__ directly
+ nn.Module.__init__(self)
+ config: Qwen3_5MoeConfig = vllm_config.model_config.hf_config
+ quant_config = vllm_config.quant_config
+ multimodal_config = vllm_config.model_config.multimodal_config
+
+ self.config = config
+ self.multimodal_config = multimodal_config
+ self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
+ self.video_pruning_rate = multimodal_config.video_pruning_rate
+ self.is_multimodal_pruning_enabled = (
+ multimodal_config.is_multimodal_pruning_enabled()
+ )
+
+ with self._mark_tower_model(vllm_config, {"image", "video"}):
+ self.visual = Qwen3_VisionTransformer(
+ config.vision_config,
+ norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+ quant_config=quant_config,
+ prefix=maybe_prefix(prefix, "visual"),
+ )
+
+ with self._mark_language_model(vllm_config):
+ self.language_model = Qwen3_5MoeForCausalLM(
+ vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model")
+ )
+
+ self.make_empty_intermediate_tensors = (
+ self.language_model.make_empty_intermediate_tensors
+ )
+
+ # set MoE hyperparameters
+ self.set_moe_parameters()
diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py
new file mode 100644
index 000000000000..8bd29f352dbf
--- /dev/null
+++ b/vllm/model_executor/models/qwen3_5_mtp.py
@@ -0,0 +1,447 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Qwen3_5 MTP model."""
+
+import typing
+from collections.abc import Callable, Iterable
+
+import torch
+from torch import nn
+from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig
+from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import (
+ Qwen3_5MoeTextConfig,
+)
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+ ParallelLMHead,
+ VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.qwen3_5 import Qwen3_5DecoderLayer, Qwen3_5RMSNorm
+from vllm.model_executor.models.qwen3_next import QwenNextMixtureOfExperts
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (
+ MultiModalEmbeddings,
+ SupportsMultiModal,
+ _require_is_multimodal,
+)
+from .utils import (
+ AutoWeightsLoader,
+ PPMissingLayer,
+ _merge_multimodal_embeddings,
+ is_pp_missing_parameter,
+ make_empty_intermediate_tensors_factory,
+ maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+@support_torch_compile(
+ dynamic_arg_dims={
+ "input_ids": 0,
+ # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
+ # otherwise (seq_len, ).
+ "positions": -1,
+ "intermediate_tensors": 0,
+ "inputs_embeds": 0,
+ "hidden_states": 0,
+ }
+)
+class Qwen3_5MultiTokenPredictor(nn.Module):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ super().__init__()
+
+ model_config = vllm_config.model_config
+ quant_config = vllm_config.quant_config
+
+ config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = model_config.hf_text_config
+
+ self.config = config
+
+ self.vocab_size = config.vocab_size
+
+ self.mtp_start_layer_idx = config.num_hidden_layers
+ self.num_mtp_layers = getattr(config, "mtp_num_hidden_layers", 1)
+
+ self.embed_tokens = VocabParallelEmbedding(
+ self.vocab_size,
+ config.hidden_size,
+ )
+
+ self.fc = ColumnParallelLinear(
+ self.config.hidden_size * 2,
+ self.config.hidden_size,
+ gather_output=True,
+ bias=False,
+ return_bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.fc",
+ )
+
+ self.layers = torch.nn.ModuleList(
+ Qwen3_5DecoderLayer(
+ vllm_config,
+ layer_type="full_attention",
+ prefix=f"{prefix}.layers.{idx}",
+ )
+ for idx in range(self.num_mtp_layers)
+ )
+
+ self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+ ["hidden_states", "residual"], config.hidden_size
+ )
+
+ self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.pre_fc_norm_hidden = Qwen3_5RMSNorm(
+ config.hidden_size, eps=config.rms_norm_eps
+ )
+ self.pre_fc_norm_embedding = Qwen3_5RMSNorm(
+ config.hidden_size, eps=config.rms_norm_eps
+ )
+
+ def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+ return self.embed_tokens(input_ids)
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ intermediate_tensors: IntermediateTensors | None = None,
+ inputs_embeds: torch.Tensor | None = None,
+ spec_step_idx: int = 0,
+ ) -> torch.Tensor:
+ if get_pp_group().is_first_rank:
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_input_ids(input_ids)
+ assert hidden_states.shape[-1] == inputs_embeds.shape[-1]
+ inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds)
+ hidden_states = self.pre_fc_norm_hidden(hidden_states)
+ hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1)
+ hidden_states = self.fc(hidden_states)
+ residual = None
+ else:
+ assert intermediate_tensors is not None
+ hidden_states = intermediate_tensors["hidden_states"]
+ residual = intermediate_tensors["residual"]
+
+ current_step_idx = spec_step_idx % self.num_mtp_layers
+ hidden_states, residual = self.layers[current_step_idx](
+ positions=positions,
+ hidden_states=hidden_states,
+ residual=residual,
+ )
+
+ if not get_pp_group().is_last_rank:
+ return IntermediateTensors(
+ {"hidden_states": hidden_states, "residual": residual}
+ )
+
+ hidden_states, _ = self.norm(hidden_states, residual)
+ return hidden_states
+
+ def load_fused_expert_weights(
+ self,
+ name: str,
+ params_dict: dict,
+ loaded_weight: torch.Tensor,
+ shard_id: str,
+ num_experts: int,
+ ) -> bool:
+ param = params_dict[name]
+ weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
+ loaded_local_expert = False
+ for expert_id in range(num_experts):
+ curr_expert_weight = loaded_weight[expert_id]
+ success = weight_loader(
+ param,
+ curr_expert_weight,
+ name,
+ shard_id,
+ expert_id,
+ return_success=True,
+ )
+ if success:
+ loaded_local_expert = True
+
+ return loaded_local_expert
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ self,
+ ckpt_gate_proj_name="gate_proj",
+ ckpt_down_proj_name="down_proj",
+ ckpt_up_proj_name="up_proj",
+ num_experts=self.config.num_experts
+ if hasattr(self.config, "num_experts")
+ else 0,
+ )
+
+ params_dict = dict(self.named_parameters())
+ loaded_params: set[str] = set()
+ is_fused_expert = False
+ fused_expert_params_mapping = [
+ ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+ ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+ ]
+ num_experts = (
+ self.config.num_experts if hasattr(self.config, "num_experts") else 0
+ )
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+ is_fused_expert = True
+ expert_params_mapping = fused_expert_params_mapping
+
+ if weight_name not in name:
+ continue
+
+ if "mlp.experts" in name:
+ continue
+
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ # Skip layers on other devices.
+ if is_pp_missing_parameter(name, self):
+ continue
+ if name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ is_expert_weight = False
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in name:
+ continue
+ is_expert_weight = True
+ name_mapped = name.replace(weight_name, param_name)
+ # Skip layers on other devices.
+ if is_pp_missing_parameter(name_mapped, self):
+ continue
+ if is_fused_expert:
+                            # Qwen3.5 checkpoints store expert weights in the
+                            # expected layout, so no transpose is needed here.
+ if "experts.gate_up_proj" in name:
+ loaded_weight = loaded_weight.chunk(2, dim=-2)
+ success_w1 = self.load_fused_expert_weights(
+ name_mapped,
+ params_dict,
+ loaded_weight[0],
+ "w1",
+ num_experts,
+ )
+ success_w3 = self.load_fused_expert_weights(
+ name_mapped,
+ params_dict,
+ loaded_weight[1],
+ "w3",
+ num_experts,
+ )
+ success = success_w1 and success_w3
+ else:
+ # down_proj
+ success = self.load_fused_expert_weights(
+ name_mapped,
+ params_dict,
+ loaded_weight,
+ shard_id,
+ num_experts,
+ )
+ if success:
+ name = name_mapped
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if (
+ name_mapped.endswith(".bias")
+ or name_mapped.endswith("_bias")
+ ) and name_mapped not in params_dict:
+ continue
+ param = params_dict[name_mapped]
+ weight_loader = param.weight_loader
+ success = weight_loader(
+ param,
+ loaded_weight,
+ name_mapped,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ return_success=True,
+ )
+ if success:
+ name = name_mapped
+ break
+ else:
+ if is_expert_weight:
+ # We've checked that this is an expert weight
+ # However it's not mapped locally to this rank
+ # So we simply skip it
+ continue
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ if is_pp_missing_parameter(name, self):
+ continue
+ if name not in params_dict:
+ logger.warning_once(
+ f"Parameter {name} not found in params_dict, skip loading"
+ )
+ continue
+ param = params_dict[name]
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ weight_loader(param, loaded_weight)
+ loaded_params.add(name)
+ return loaded_params
+
+
+@support_torch_compile(
+ dynamic_arg_dims={
+ "input_ids": 0,
+ # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
+ # otherwise (seq_len, ).
+ "positions": -1,
+ "intermediate_tensors": 0,
+ "inputs_embeds": 0,
+ "hidden_states": 0,
+ }
+)
+class Qwen3_5MTP(nn.Module, SupportsMultiModal):
+ packed_modules_mapping = {
+ "qkv_proj": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ ],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+ }
+
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ config = vllm_config.model_config.hf_text_config
+ self.vllm_config = vllm_config
+ cache_config = vllm_config.cache_config
+ if cache_config.mamba_cache_mode == "all":
+ raise NotImplementedError(
+ "Qwen3_5MTP currently does not support 'all' prefix caching, "
+ "please use '--mamba-cache-mode=align' instead"
+ )
+
+ self.quant_config = vllm_config.quant_config
+
+ super().__init__()
+ self.config = config
+ self.model = Qwen3_5MultiTokenPredictor(
+ vllm_config=vllm_config, prefix=maybe_prefix(prefix, "mtp")
+ )
+
+ if get_pp_group().is_last_rank:
+ if config.tie_word_embeddings:
+ self.lm_head = self.model.embed_tokens
+ else:
+ self.lm_head = ParallelLMHead(
+ config.vocab_size,
+ config.hidden_size,
+ prefix=maybe_prefix(prefix, "lm_head"),
+ )
+ else:
+ self.lm_head = PPMissingLayer()
+
+ self.logits_processor = LogitsProcessor(config.vocab_size)
+
+ def embed_input_ids(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: MultiModalEmbeddings | None = None,
+ *,
+ is_multimodal: torch.Tensor | None = None,
+ handle_oov_mm_token: bool = False,
+ ) -> torch.Tensor:
+ inputs_embeds = self._embed_text_input_ids(
+ input_ids,
+ self.model.embed_input_ids,
+ is_multimodal=is_multimodal,
+ handle_oov_mm_token=handle_oov_mm_token,
+ )
+
+ if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+ return inputs_embeds
+
+ is_multimodal = _require_is_multimodal(is_multimodal)
+
+ inputs_embeds = _merge_multimodal_embeddings(
+ inputs_embeds=inputs_embeds,
+ multimodal_embeddings=multimodal_embeddings,
+ is_multimodal=is_multimodal,
+ )
+
+ return inputs_embeds
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ hidden_states: torch.Tensor,
+ intermediate_tensors: IntermediateTensors | None = None,
+ inputs_embeds: torch.Tensor | None = None,
+ **kwargs: object,
+ ):
+ hidden_states = self.model(
+ input_ids, positions, hidden_states, intermediate_tensors, inputs_embeds
+ )
+ return hidden_states
+
+ def compute_logits(
+ self,
+ hidden_states: torch.Tensor,
+ spec_step_idx: int = 0,
+ ) -> torch.Tensor | None:
+ return self.logits_processor(self.lm_head, hidden_states)
+
+ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ def remap_weight_names(weights):
+ for name, weight in weights:
+ if name.startswith("mtp."):
+ name = name.replace("mtp.", "model.")
+ elif any(key in name for key in ["embed_tokens", "lm_head"]):
+ if "embed_tokens" in name:
+ name = name.replace("language_model.", "")
+ else:
+ continue
+ yield name, weight
+
+ loader = AutoWeightsLoader(self)
+ return loader.load_weights(remap_weight_names(weights))
+
+
+class Qwen3_5MoeMTP(Qwen3_5MTP, QwenNextMixtureOfExperts):
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ super().__init__(vllm_config=vllm_config, prefix=prefix)
+ self.set_moe_parameters()
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 503b40702070..3bcfbacbb381 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -105,7 +105,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
- config = vllm_config.model_config.hf_config
+ config = vllm_config.model_config.hf_text_config
parallel_config = vllm_config.parallel_config
quant_config = vllm_config.quant_config
@@ -176,7 +176,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
hidden_size=config.hidden_size,
intermediate_size=config.moe_intermediate_size,
reduce_results=False,
- renormalize=config.norm_topk_prob,
+ renormalize=getattr(config, "norm_topk_prob", True),
quant_config=quant_config,
prefix=f"{prefix}.experts",
enable_eplb=self.enable_eplb,
@@ -965,7 +965,7 @@ class Qwen3NextModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
- config: Qwen3NextConfig = vllm_config.model_config.hf_config
+ config: Qwen3NextConfig = vllm_config.model_config.hf_text_config
parallel_config = vllm_config.parallel_config
eplb_config = parallel_config.eplb_config
@@ -1042,7 +1042,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
- num_experts=self.config.num_experts,
+ num_experts=getattr(self.config, "num_experts", 0),
num_redundant_experts=self.num_redundant_experts,
)
@@ -1201,7 +1201,7 @@ class Qwen3NextForCausalLM(
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
- config = vllm_config.model_config.hf_config
+ config = vllm_config.model_config.hf_text_config
self.vllm_config = vllm_config
self.model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
@@ -1265,7 +1265,7 @@ def get_mamba_state_shape_from_config(
cls, vllm_config: "VllmConfig"
) -> tuple[tuple[int, int], tuple[int, int]]:
parallel_config = vllm_config.parallel_config
- hf_config = vllm_config.model_config.hf_config
+ hf_config = vllm_config.model_config.hf_text_config
tp_size = parallel_config.tensor_parallel_size
num_spec = (
vllm_config.speculative_config.num_speculative_tokens
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 6e68b24ba4d1..1871591c9eca 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -466,6 +466,14 @@
"qwen3_vl_moe",
"Qwen3VLMoeForConditionalGeneration",
),
+ "Qwen3_5ForConditionalGeneration": (
+ "qwen3_5",
+ "Qwen3_5ForConditionalGeneration",
+ ),
+ "Qwen3_5MoeForConditionalGeneration": (
+ "qwen3_5",
+ "Qwen3_5MoeForConditionalGeneration",
+ ),
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
@@ -509,6 +517,8 @@
"OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"),
"Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),
"Step3p5MTP": ("step3p5_mtp", "Step3p5MTP"),
+ "Qwen3_5MTP": ("qwen3_5_mtp", "Qwen3_5MTP"),
+ "Qwen3_5MoeMTP": ("qwen3_5_mtp", "Qwen3_5MoeMTP"),
# Temporarily disabled.
# # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index f82186639299..5fc737e8ee90 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -420,6 +420,11 @@ def get_num_hidden_layers(self) -> int:
return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
+class Qwen3_5MTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
+ def get_num_hidden_layers(self) -> int:
+ return getattr(self.hf_text_config, "mtp_num_hidden_layers", 0)
+
+
class PanguUltraMoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
def get_num_hidden_layers(self) -> int:
return getattr(self.hf_text_config, "num_nextn_predict_layers", 0)
@@ -445,6 +450,7 @@ def get_num_hidden_layers(self) -> int:
"nemotron-nas": NemotronNasModelArchConfigConvertor,
"deepseek_mtp": DeepSeekMTPModelArchConfigConvertor,
"qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor,
+ "qwen3_5_mtp": Qwen3_5MTPModelArchConfigConvertor,
"mimo_mtp": MimoMTPModelArchConfigConvertor,
"glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor,
"glm_ocr_mtp": GLM4MoeMTPModelArchConfigConvertor,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index d4b38d67021a..d29ee00fa1dc 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1356,6 +1356,8 @@ def load_model(self, target_model: nn.Module) -> None:
"Qwen3VLMoeForConditionalGeneration",
"HunYuanVLForConditionalGeneration",
"GlmOcrForConditionalGeneration",
+ "Qwen3_5ForConditionalGeneration",
+ "Qwen3_5MoeForConditionalGeneration",
]:
self.model.config.image_token_index = target_model.config.image_token_id
elif self.get_model_name(target_model) == "PixtralForConditionalGeneration":