From a62c2df7715364aa36b8895bc3324b77dc83ffae Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:19:29 +0100 Subject: [PATCH 01/70] Rename `rope_scaling` -> `rope_parameters` in `get_rope` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../layers/rotary_embedding/__init__.py | 73 ++++++++++--------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 56c165f9c041..229598c17720 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -28,21 +28,22 @@ def get_rope( max_position: int, base: float, is_neox_style: bool = True, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - if rope_scaling is not None: + if rope_parameters is not None: # Transforms every value that is a list into a tuple for caching calls - rope_scaling_tuple = { - k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items() + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in rope_parameters.items() } - rope_scaling_args = tuple(rope_scaling_tuple.items()) + rope_parameters_args = tuple(rope_parameters_tuple.items()) else: - rope_scaling_args = None + rope_parameters_args = None if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -62,7 +63,7 @@ def get_rope( max_position, base, is_neox_style, - rope_scaling_args, + rope_parameters_args, dual_chunk_attention_args, dtype, ) @@ -84,18 +85,18 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_scaling: + elif not rope_parameters: rotary_emb = RotaryEmbedding( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) else: - scaling_type = rope_scaling["rope_type"] + scaling_type = rope_parameters["rope_type"] if scaling_type == "llama3": - scaling_factor = rope_scaling["factor"] - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] rotary_emb = Llama3RotaryEmbedding( head_size, rotary_dim, @@ -113,7 +114,7 @@ def get_rope( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) elif scaling_type == "default": - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: rotary_emb = MRotaryEmbedding( head_size, rotary_dim, @@ -121,8 +122,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), ) else: rotary_emb = RotaryEmbedding( @@ -134,7 +135,7 @@ def get_rope( dtype, ) elif scaling_type == "linear": - scaling_factor = rope_scaling["factor"] + scaling_factor = rope_parameters["factor"] rotary_emb = LinearScalingRotaryEmbedding( head_size, rotary_dim, 
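A note on the caching hunk near the top of this patch: the resulting arguments are used as a cache key, so any list-valued entry in `rope_parameters` (for example longrope's `short_factor`/`long_factor`) is converted to a tuple first. A minimal standalone sketch of that normalization, with made-up values (this is an illustration, not vLLM code):

from typing import Any


def hashable_rope_args(rope_parameters: dict[str, Any] | None):
    # Mirror get_rope's normalization: lists -> tuples so the dict's items
    # can participate in a hashable cache key.
    if rope_parameters is None:
        return None
    normalized = {
        k: tuple(v) if isinstance(v, list) else v
        for k, v in rope_parameters.items()
    }
    return tuple(normalized.items())


# Illustrative longrope-style parameters; the list values become tuples.
args = hashable_rope_args(
    {"rope_type": "longrope", "short_factor": [1.0, 2.0], "long_factor": [4.0, 8.0]}
)
hash(args)  # usable as part of a dict- or lru_cache-style key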
@@ -145,8 +146,8 @@ def get_rope( dtype, ) elif scaling_type == "ntk": - scaling_factor = rope_scaling["factor"] - mixed_b = rope_scaling.get("mixed_b", None) + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b", None) rotary_emb = NTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -158,8 +159,8 @@ def get_rope( mixed_b, ) elif scaling_type == "dynamic": - if "alpha" in rope_scaling: - scaling_alpha = rope_scaling["alpha"] + if "alpha" in rope_parameters: + scaling_alpha = rope_parameters["alpha"] rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, @@ -169,8 +170,8 @@ def get_rope( scaling_alpha, dtype, ) - elif "factor" in rope_scaling: - scaling_factor = rope_scaling["factor"] + elif "factor" in rope_parameters: + scaling_factor = rope_parameters["factor"] rotary_emb = DynamicNTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -185,11 +186,11 @@ def get_rope( "Dynamic rope scaling must contain either 'alpha' or 'factor' field" ) elif scaling_type == "yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -199,7 +200,7 @@ def get_rope( "apply_yarn_scaling", ) } - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: extra_kwargs.pop("apply_yarn_scaling", None) rotary_emb = MRotaryEmbedding( head_size, @@ -208,8 +209,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), scaling_factor=scaling_factor, **extra_kwargs, ) @@ -225,12 +226,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "deepseek_yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] # assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -252,12 +253,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "longrope": - short_factor = rope_scaling["short_factor"] - long_factor = rope_scaling["long_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ("short_mscale", "long_mscale") } rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( From f42b03d5b95e7b562915e8826e5ac02309f00afd Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:25:58 +0100 Subject: [PATCH 02/70] Patch rope parameters to new name, `rope_parameters` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 47 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff 
--git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 14cae2b168e1..2873b615fd69 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -25,6 +25,7 @@ RevisionNotFoundError, ) from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -34,6 +35,7 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs +from vllm.config.utils import getattr_iter from vllm.logger import init_logger from vllm.transformers_utils.config_parser_base import ConfigParserBase from vllm.transformers_utils.utils import ( @@ -389,32 +391,29 @@ def file_or_path_exists( ) -def patch_rope_scaling(config: PretrainedConfig) -> None: +def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - patch_rope_scaling(text_config) - - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None: - patch_rope_scaling_dict(rope_scaling) - - -def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: - if "rope_type" in rope_scaling and "type" in rope_scaling: - rope_type = rope_scaling["rope_type"] - rope_type_legacy = rope_scaling["type"] - if rope_type != rope_type_legacy: - raise ValueError( - f"Found conflicts between 'rope_type={rope_type}' (modern " - f"field) and 'type={rope_type_legacy}' (legacy field). " - "You should only specify one of them." - ) + text_config = config.get_text_config() + # (Transformers v5, Transformers v4) + rope_parameters_keys = ("rope_parameters", "rope_scaling") + rope_parameters = getattr_iter(text_config, rope_parameters_keys, None) + + if rope_parameters is not None: + # Forward compatibility for Transformers v5 + # (can be removed once Transformers v4 is no longer supported) + cls_attr = getattr(type(text_config), "rope_scaling", None) + if not isinstance(cls_attr, property): + text_config.rope_parameters = rope_parameters + delattr(text_config, "rope_scaling") + # Handle nested rope_parameters in interleaved sliding attention models + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in rope_parameters.values(): + patch_rope_parameters_dict(rope_parameters_layer_type) + else: + patch_rope_parameters_dict(rope_parameters) - if "rope_type" not in rope_scaling and "type" in rope_scaling: - rope_scaling["rope_type"] = rope_scaling["type"] - logger.info("Replacing legacy 'type' key with 'rope_type'") +def patch_rope_parameters_dict(rope_scaling: dict[str, Any]) -> None: if "rope_type" not in rope_scaling: raise ValueError("rope_scaling should have a 'rope_type' key") @@ -679,7 +678,7 @@ def get_config( logger.debug("Overriding HF config with %s", hf_overrides_fn) config = hf_overrides_fn(config) - patch_rope_scaling(config) + patch_rope_parameters(config) if trust_remote_code: maybe_register_config_serialize_by_value() From a2a94374d7c682bdbc488de8df382215371996ab Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:58:25 +0100 Subject: [PATCH 03/70] Update models where it's a simple rename Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- 
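The `patch_rope_parameters` helper in the previous patch reads whichever attribute the text config exposes (`rope_parameters` on Transformers v5, `rope_scaling` on v4) and re-exposes it under the new name. A simplified standalone sketch of that fallback read, using a plain namespace object rather than a real `PretrainedConfig` (the function and config below are stand-ins, not vLLM's actual helper):

from types import SimpleNamespace


def read_rope_parameters(text_config) -> dict | None:
    # Prefer the Transformers v5 attribute name, then fall back to the v4
    # name, roughly what the getattr_iter call in the patch does.
    for key in ("rope_parameters", "rope_scaling"):
        value = getattr(text_config, key, None)
        if value is not None:
            return value
    return None


# A v4-style config that still uses the legacy attribute name.
legacy_config = SimpleNamespace(rope_scaling={"rope_type": "linear", "factor": 2.0})
assert read_rope_parameters(legacy_config) == {"rope_type": "linear", "factor": 2.0}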
vllm/model_executor/models/arcee.py | 8 ++++---- vllm/model_executor/models/bailing_moe.py | 2 +- vllm/model_executor/models/bamba.py | 4 ++-- vllm/model_executor/models/chameleon.py | 20 +++++++++---------- vllm/model_executor/models/dots1.py | 8 ++++---- vllm/model_executor/models/ernie45_moe.py | 8 ++++---- vllm/model_executor/models/ernie45_vl_moe.py | 6 +++--- vllm/model_executor/models/exaone.py | 16 +++++++-------- vllm/model_executor/models/falcon_h1.py | 4 ++-- vllm/model_executor/models/glm4.py | 8 ++++---- vllm/model_executor/models/glm4_moe.py | 8 ++++---- vllm/model_executor/models/gpt_oss.py | 10 +++++----- vllm/model_executor/models/granite.py | 12 +++++------ vllm/model_executor/models/granitemoe.py | 8 ++++---- .../model_executor/models/granitemoeshared.py | 4 ++-- vllm/model_executor/models/hunyuan_v1.py | 18 ++++++++--------- vllm/model_executor/models/internlm2.py | 8 ++++---- vllm/model_executor/models/internlm2_ve.py | 4 ++-- vllm/model_executor/models/kimi_linear.py | 4 ++-- vllm/model_executor/models/llama4.py | 8 ++++---- vllm/model_executor/models/longcat_flash.py | 12 +++++------ vllm/model_executor/models/minicpm.py | 8 ++++---- vllm/model_executor/models/minicpm3.py | 6 +++--- vllm/model_executor/models/minicpm_eagle.py | 4 ++-- vllm/model_executor/models/minimax_m2.py | 8 ++++---- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/nemotron.py | 12 +++++------ vllm/model_executor/models/nemotron_nas.py | 16 +++++++-------- vllm/model_executor/models/olmoe.py | 4 ++-- vllm/model_executor/models/orion.py | 8 ++++---- vllm/model_executor/models/ouro.py | 8 ++++---- vllm/model_executor/models/phimoe.py | 8 ++++---- vllm/model_executor/models/plamo2.py | 6 ++---- vllm/model_executor/models/qwen.py | 8 ++++---- vllm/model_executor/models/qwen2.py | 8 ++++---- vllm/model_executor/models/qwen2_moe.py | 8 ++++---- vllm/model_executor/models/qwen3.py | 8 ++++---- vllm/model_executor/models/qwen3_moe.py | 8 ++++---- vllm/model_executor/models/seed_oss.py | 8 ++++---- vllm/model_executor/models/solar.py | 12 +++++------ vllm/model_executor/models/step3_text.py | 8 ++++---- vllm/model_executor/models/zamba2.py | 2 +- 42 files changed, 169 insertions(+), 171 deletions(-) diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index f33970aff279..a6820fb8c5fc 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -105,11 +105,11 @@ def __init__( self.hidden_size = config.hidden_size # Rotary embedding parameters (reuse LLaMA defaults) rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -134,7 +134,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 641bdb69c366..ab8e9ea5d8a4 100644 --- 
a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -138,7 +138,7 @@ def __init__( max_position=config.max_position_embeddings, base=config.rope_theta, is_neox_style=True, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 4a2b3da1c194..93ce58067196 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -158,7 +158,7 @@ def __init__( ) -> None: super().__init__() rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -193,7 +193,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, base=rope_theta, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 64f73e938bf6..0285cde9ea0d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -265,7 +265,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -318,7 +318,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -369,11 +369,11 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) @@ -385,7 +385,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, @@ -439,11 +439,11 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) @@ -455,7 +455,7 @@ def __init__( config, "num_key_value_heads", 
config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 15caa3184581..1599c4f020fa 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -203,7 +203,7 @@ def __init__( num_kv_heads: int, config: Dots1Config, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -256,7 +256,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -297,7 +297,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = layer_idx @@ -308,7 +308,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, config=config, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index b35666175ea7..0e8284a3785c 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -234,7 +234,7 @@ def __init__( num_kv_heads: int, head_dim: int | None = None, rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, @@ -293,7 +293,7 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, is_neox_style=False, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -334,7 +334,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, @@ -342,7 +342,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index d002d1838c8e..2fd0522832a4 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -93,7 +93,7 @@ def __init__( num_kv_heads: int, head_dim: int | None = None, rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: 
dict[str, Any] | None = None, freq_allocation: int = 20, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, @@ -414,7 +414,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) @@ -424,7 +424,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, freq_allocation=freq_allocation, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index b9c7a520caff..fb7dfbb535d1 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -114,7 +114,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -174,7 +174,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -208,7 +208,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -222,7 +222,7 @@ def __init__( num_heads=num_heads, num_kv_heads=num_kv_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=bias, @@ -252,11 +252,11 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -273,7 +273,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 38838be29093..71b4add07770 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -217,7 +217,7 @@ def __init__( ) -> None: super().__init__() rope_theta = getattr(config, "rope_theta", 1e11) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, 
"max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -256,7 +256,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, base=rope_theta, is_neox_style=True, dtype=None, # see impl of get_rope diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 4172f16737c1..12b8b6349436 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -60,7 +60,7 @@ def __init__( rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, + rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -108,7 +108,7 @@ def __init__( rotary_dim=self.rotary_dim, max_position=max_position, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) @@ -151,7 +151,7 @@ def __init__( self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = Glm4Attention( config=config, @@ -164,7 +164,7 @@ def __init__( head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index b30bd66161da..afa37a71478a 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -234,7 +234,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -292,7 +292,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( @@ -342,7 +342,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. 
@@ -355,7 +355,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 291ac833f26a..e41447470b94 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -69,14 +69,14 @@ def __init__( max_position=config.max_position_embeddings, base=config.rope_theta, dtype=torch.float32, - rope_scaling={ + rope_parameters={ "rope_type": "yarn", - "factor": config.rope_scaling["factor"], - "original_max_position_embeddings": config.rope_scaling[ + "factor": config.rope_parameters["factor"], + "original_max_position_embeddings": config.rope_parameters[ "original_max_position_embeddings" ], - "beta_fast": config.rope_scaling["beta_fast"], - "beta_slow": config.rope_scaling["beta_slow"], + "beta_fast": config.rope_parameters["beta_fast"], + "beta_slow": config.rope_parameters["beta_slow"], }, is_neox_style=True, ) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 5fc8718ca75e..c294ac7b91df 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -113,7 +113,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -168,7 +168,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -205,11 +205,11 @@ def __init__( self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -226,7 +226,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index c5b36c362ee3..b316fd317475 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -142,7 +142,7 @@ def __init__( num_kv_heads: int, max_position: int = 4096 * 32, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -196,7 +196,7 @@ def __init__( max_position=max_position, base=int(self.rope_theta), is_neox_style=True, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -237,14 +237,14 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index e08e9f73ec87..9fe7b6816e48 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -86,14 +86,14 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index a05a00932c13..3f9f9a3295a8 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -143,7 +143,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -205,7 +205,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -255,7 +255,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -315,7 +315,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -495,11 +495,11 @@ def __init__( else config.intermediate_size[layer_id] ) rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -521,7 +521,7 @@ def __init__( 
config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, @@ -538,7 +538,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d856f5c79e33..602dff88551f 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -92,7 +92,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -145,7 +145,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -205,14 +205,14 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6dc081e34157..d9f0c0019637 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -31,14 +31,14 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index cce22842d333..75d1d9ce90f5 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -192,7 +192,7 @@ def __init__( kv_lora_rank: int, rope_theta: float = 10000, use_nope: bool = False, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -214,7 +214,7 @@ def __init__( self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None - assert rope_scaling is None 
+ assert rope_parameters is None assert num_heads % tp_size == 0 self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a7e0732ec71e..dae85328d434 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -172,7 +172,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -249,7 +249,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=int(rope_theta), - rope_scaling=rope_scaling if rope_scaling != "default" else None, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) if not self.nope @@ -332,7 +332,7 @@ def __init__( self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters max_position_embeddings = config.max_position_embeddings self.self_attn = Llama4Attention( @@ -341,7 +341,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index b848ae6e822f..6440086bbe65 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -109,7 +109,7 @@ def __init__( pretraining_tp=1, tie_word_embeddings=False, rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, mla_scale_q_lora=False, @@ -163,7 +163,7 @@ def __init__( self.pretraining_tp = pretraining_tp self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mla_scale_q_lora = mla_scale_q_lora @@ -337,12 +337,12 @@ def __init__( self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if rope_scaling is not None and getattr( + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) @@ -362,7 +362,7 @@ def __init__( ), kv_lora_rank=config.kv_lora_rank, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=None diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index d9f0b477180e..d2e8ce53f31d 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -231,7 +231,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - 
rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -282,7 +282,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -325,7 +325,7 @@ def __init__( self.quant_config = quant_config self.hidden_size = config.hidden_size self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) + self.rope_parameters = getattr(config, "rope_parameters", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -340,7 +340,7 @@ def _init_attn_block(self): num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index d3b6966ee3a7..b6b998dbc58c 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -63,7 +63,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -128,7 +128,7 @@ def __init__( rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_local_heads, @@ -205,7 +205,7 @@ def _init_attn_block(self): q_lora_rank=self.config.q_lora_rank, kv_lora_rank=self.config.kv_lora_rank, rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 6efc61e25ea1..e2d592c97bb8 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -69,7 +69,7 @@ def __init__( self.quant_config = quant_config self.hidden_size = config.hidden_size self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) + self.rope_parameters = getattr(config, "rope_parameters", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -84,7 +84,7 @@ def _init_attn_block(self): num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 21ed428a05d0..9c8c807c6618 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ 
-150,7 +150,7 @@ def __init__( num_kv_heads: int, rotary_dim: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, @@ -206,7 +206,7 @@ def __init__( rotary_dim=rotary_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -253,7 +253,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = max( @@ -270,7 +270,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, rotary_dim=config.rotary_dim, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 4548abde77d5..3fbb1839bbfb 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -292,7 +292,7 @@ def __init__( # number of image patches max_position=(config.image_size // config.patch_size) ** 2, base=config.rope_theta, - rope_scaling={"rope_type": "mllama4"}, + rope_parameters={"rope_type": "mllama4"}, is_neox_style=False, dtype=torch.complex64, # important ) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 17e8e7f28258..f986f42a2a01 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -151,7 +151,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -207,7 +207,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( @@ -244,11 +244,11 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -265,7 +265,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/nemotron_nas.py 
b/vllm/model_executor/models/nemotron_nas.py index acd0d0c98234..72b90632a7fb 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -83,7 +83,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -98,7 +98,7 @@ def __init__( num_heads, num_kv_heads, rope_theta, - rope_scaling, + rope_parameters, max_position_embeddings, quant_config, bias, @@ -111,7 +111,7 @@ def __init__( def _init_rotary_emb( self, config, - rope_scaling: dict[str, Any] | None, + rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: # Enables YARN for Mistral and LLaMA4 derivatives. @@ -127,7 +127,7 @@ def _init_rotary_emb( rotary_dim=self.head_dim, max_position=self.max_position_embeddings, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -149,11 +149,11 @@ def __init__( self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -177,7 +177,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=num_kv_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 35a09334a129..8ec7a8a913d6 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -124,7 +124,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) num_heads = config.num_attention_heads @@ -177,7 +177,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index cbfce18b4388..c8d71b7d2056 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -89,7 +89,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -140,7 +140,7 @@ def __init__( rotary_dim=self.head_dim, 
max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -176,14 +176,14 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index cc7947df50ae..5704725779ed 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -115,7 +115,7 @@ def __init__( rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, + rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -171,7 +171,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = nn.ModuleList() @@ -228,7 +228,7 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -247,7 +247,7 @@ def __init__( rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 97e553787790..321fd9509256 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -307,7 +307,7 @@ def __init__( rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: dict | None = None, + rope_parameters: dict | None = None, prefix: str = "", ) -> None: super().__init__() @@ -333,7 +333,7 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + self.rope_parameters = rope_parameters self.qkv_proj = QKVParallelLinear( hidden_size, @@ -357,7 +357,7 @@ def __init__( max_position=max_position, base=int(self.rope_theta), is_neox_style=True, - rope_scaling=self.rope_scaling, + rope_parameters=self.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -405,7 +405,7 @@ def __init__( rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = PhiMoE( diff --git 
a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index ece1c5ec23cf..2d59af4258f8 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -577,9 +577,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No ) self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) + self.rope_parameters = getattr(config, "rope_parameters", None) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -591,7 +589,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No rotary_dim=self.head_dim, max_position=max_position, base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.rope_parameters, ) self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm.weight = torch.nn.Parameter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index c99f628004fb..f00be75a6f12 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -84,7 +84,7 @@ def __init__( num_heads: int, max_position_embeddings: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -118,7 +118,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -154,13 +154,13 @@ def __init__( self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index cdf32c6c5137..1bd0c831aaab 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -118,7 +118,7 @@ def __init__( rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, + rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -168,7 +168,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) attn_cls = ( @@ -218,7 +218,7 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -240,7 +240,7 @@ def __init__( 
rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index c03bd6a3c6d7..3f6b3e6323d5 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -195,7 +195,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -249,7 +249,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -292,7 +292,7 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -302,7 +302,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index f689ff79d761..12f54243f441 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -64,7 +64,7 @@ def __init__( rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, + rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -114,7 +114,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -168,7 +168,7 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -193,7 +193,7 @@ def __init__( head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index d57b82cb0227..2fa6f937925a 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -217,7 +217,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + 
rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -274,7 +274,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -327,7 +327,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -337,7 +337,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 04da19a440a1..5029c7876a4b 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -116,7 +116,7 @@ def __init__( rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, + rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -164,7 +164,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -202,7 +202,7 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) # By default, SeedOss uses causal attention as it is a # decoder-only model. 
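As a quick reference for the call-site changes in this series, a minimal sketch of the renamed keyword in use. The head size, max position, and YaRN values below are illustrative only and are not taken from any model file in this series:

import torch
from vllm.model_executor.layers.rotary_embedding import get_rope

# Hypothetical YaRN-style settings; the point is only the new keyword name.
rope_theta = 1_000_000.0
rope_parameters = {
    "rope_type": "yarn",
    "factor": 4.0,
    "original_max_position_embeddings": 32_768,
}
rotary_emb = get_rope(
    head_size=128,
    rotary_dim=128,
    max_position=131_072,
    base=rope_theta,
    is_neox_style=True,
    rope_parameters=rope_parameters,  # previously passed as `rope_scaling=`
    dtype=torch.bfloat16,
)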
@@ -222,7 +222,7 @@ def __init__( rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 5b8bf150edf6..31572f8505cf 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -112,7 +112,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -167,7 +167,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -203,12 +203,12 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) - if rope_scaling is not None and getattr( + if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None ): - rope_scaling["original_max_position_embeddings"] = ( + rope_parameters["original_max_position_embeddings"] = ( config.original_max_position_embeddings ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) @@ -225,7 +225,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 381b3f4932e5..4cd77edc52ee 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -146,7 +146,7 @@ def __init__( norm_eps: float, rope_theta: int, share_q_dim: int | None = None, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 8192, head_dim: int = 256, cache_config: CacheConfig | None = None, @@ -199,7 +199,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embedding, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) scaling = self.head_dim**-0.5 self.attn = Attention( @@ -235,7 +235,7 @@ def __init__( super().__init__() config = config.hf_config self.hidden_size = config.hidden_size - rope_scaling = getattr(config, "rope_scaling", None) + rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = Step3TextAttention( hidden_size=self.hidden_size, @@ -248,7 +248,7 @@ def __init__( head_dim=config.head_dim, share_q_dim=config.share_q_dim, rope_theta=config.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index bf3107525bc5..2deb62182423 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -234,7 +234,7 @@ def __init__( rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, base=self.rope_theta, - rope_scaling=None, + rope_parameters=None, 
is_neox_style=True, ) From fba5bf5ac92e230d304eb29586e9e5bbcc6bb592 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:03:27 +0100 Subject: [PATCH 04/70] Fix model config overrides Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 48 +++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 6ce91ebb87b9..082b998ce97c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -13,6 +13,7 @@ from pydantic import ConfigDict, SkipValidation, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig @@ -2060,30 +2061,33 @@ def _get_and_verify_max_len( ) derived_max_model_len = default_max_len - rope_scaling = getattr(hf_config, "rope_scaling", None) + rope_parameters = getattr(hf_config, "rope_parameters", None) # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. - if rope_scaling is not None and "gemma3" not in hf_config.model_type: - # No need to consider "type" key because of patch_rope_scaling when - # loading HF config - rope_type = rope_scaling["rope_type"] - - if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: - # TODO(robertgshaw): Find a model that supports rope_scaling - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "with rope_scaling. Please raise an issue so we can " - "investigate." - ) + if rope_parameters is not None and "gemma3" not in hf_config.model_type: + # In Transformers v5 this could be RopeParameters or dict[str, RopeParameters] + # To simplify, we convert any RopeParameters to dict[str, RopeParameters] + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters = {"": rope_parameters} + for rp in rope_parameters.values(): + rope_type = rp["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + if disable_sliding_window: + # TODO(robertgshaw): Find a model that supports rope_parameters + # with sliding window to see if this case should be allowed. + raise NotImplementedError( + "Disabling sliding window is not supported for models with " + "rope_parameters. Please raise an issue so we can investigate." 
+ ) - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rope_scaling.get("factor", 1.0) + # NOTE: rope_type == "default" does not define factor + # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + scaling_factor = rp.get("factor", 1.0) - if rope_type == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] + if rope_type == "yarn": + derived_max_model_len = rp["original_max_position_embeddings"] + # Do this outside loop since all layers should have the same scaling derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: @@ -2094,7 +2098,9 @@ def _get_and_verify_max_len( if max_model_len is None: # For LongRoPE, default to original_max_position_embeddings to avoid # performance degradation for shorter sequences - if rope_scaling is not None and rope_scaling["rope_type"] == "longrope": + if rope_parameters is not None and any( + rp["rope_type"] == "longrope" for rp in rope_parameters.values() + ): max_model_len = int( getattr( hf_config, "original_max_position_embeddings", derived_max_model_len From ee5cf666662dc8b4b107beda64d56686f26baf73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:04:30 +0100 Subject: [PATCH 05/70] Update examples Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- examples/offline_inference/context_extension.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index df39e4c25d5c..0c5d7c12fb19 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This script demonstrates how to extend the context length -of a Qwen model using the YARN method (rope_scaling) +of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. 
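The diff below renames the key inside the example's `hf_overrides`; as a sketch of the end result when constructing the engine directly (the model id and override values here are placeholders, not the ones used in the script):

from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model id
    hf_overrides={
        "rope_theta": 1_000_000,
        "rope_parameters": {
            "rope_type": "yarn",
            "factor": 4.0,
            "original_max_position_embeddings": 32_768,
        },
    },
    max_model_len=131_072,
)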
Usage: @@ -20,7 +20,7 @@ def create_llm(): # Use yarn to extend context hf_overrides = { "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, From 080530ddeeb8c4f827f984bb7f867fc3990823a2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:06:15 +0100 Subject: [PATCH 06/70] Update benchmarks Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579..b9036dad09d3 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,9 +6,9 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, -# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, -# speedup +# rope_theta, is_neox_style, rope_parameters, dtype, torch_mean, torch_median, +# torch_p99, torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, +# triton_max, speedup # # == Usage Examples == # @@ -88,7 +88,7 @@ def benchmark_mrope( max_position: int = 8192, rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -104,7 +104,7 @@ def benchmark_mrope( max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -205,7 +205,7 @@ def benchmark_mrope( max_position, rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -257,7 +257,7 @@ def benchmark_mrope( "max_position", "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -317,7 +317,7 @@ def benchmark_mrope( max_position=max_position, rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, From 889b9002eae3bdcfd0b3c403cf1a5959fc2f3a36 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:12:34 +0100 Subject: [PATCH 07/70] More renaming in transformers utils Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2873b615fd69..6b2409f0f01a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -413,25 +413,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: patch_rope_parameters_dict(rope_parameters) -def patch_rope_parameters_dict(rope_scaling: dict[str, Any]) -> None: - if "rope_type" not in rope_scaling: - raise ValueError("rope_scaling should have a 'rope_type' key") +def 
patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: + if "rope_type" not in rope_parameters: + raise ValueError("rope_parameters should have a 'rope_type' key") - if rope_scaling["rope_type"] == "su": - rope_scaling["rope_type"] = "longrope" + if rope_parameters["rope_type"] == "su": + rope_parameters["rope_type"] = "longrope" logger.warning("Replacing legacy rope_type 'su' with 'longrope'") - elif rope_scaling["rope_type"] == "mrope": - assert "mrope_section" in rope_scaling - rope_scaling["rope_type"] = "default" + elif rope_parameters["rope_type"] == "mrope": + assert "mrope_section" in rope_parameters + rope_parameters["rope_type"] = "default" logger.warning("Replacing legacy rope_type 'mrope' with 'default'") def _uses_mrope(config: PretrainedConfig) -> bool: - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is None: + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is None: return False - return "mrope_section" in rope_scaling + return "mrope_section" in rope_parameters def uses_mrope(config: PretrainedConfig) -> bool: From 50b1a870cac0266bc3e6fc8dafcbaeaed53c7ae4 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:26:52 +0100 Subject: [PATCH 08/70] Fix `patch_rope_parameters` for when `rope_scaling` was explicitly `None` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b2409f0f01a..129c7ef91aa4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -398,13 +398,14 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: rope_parameters_keys = ("rope_parameters", "rope_scaling") rope_parameters = getattr_iter(text_config, rope_parameters_keys, None) + # Forward compatibility for Transformers v5 + # (can be removed once Transformers v4 is no longer supported) + cls_attr = getattr(type(text_config), "rope_scaling", None) + if not isinstance(cls_attr, property): + text_config.rope_parameters = rope_parameters + delattr(text_config, "rope_scaling") + if rope_parameters is not None: - # Forward compatibility for Transformers v5 - # (can be removed once Transformers v4 is no longer supported) - cls_attr = getattr(type(text_config), "rope_scaling", None) - if not isinstance(cls_attr, property): - text_config.rope_parameters = rope_parameters - delattr(text_config, "rope_scaling") # Handle nested rope_parameters in interleaved sliding attention models if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): for rope_parameters_layer_type in rope_parameters.values(): From bd182e063ce03d12ef11f16d8b7aea87f7c8245b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:28:14 +0100 Subject: [PATCH 09/70] Update Gemma3 and Gemma3n Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gemma3.py | 28 +++++++++++++++++---------- vllm/model_executor/models/gemma3n.py | 27 ++++++++++++++++---------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 213f9f562f8a..721ab9bc4ca7 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -155,25 +155,33 @@ def __init__( 
self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) layer_idx = extract_layer_index(prefix) - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + self.is_sliding = layer_type == "sliding_attention" sliding_window = config.sliding_window if self.is_sliding else None # Initialize the rotary embedding. - if self.is_sliding: - # Local attention. Override the values in config.json. - self.rope_theta = config.rope_local_base_freq - self.rope_scaling = {"rope_type": "default"} + if config.rope_parameters and layer_type in config.rope_parameters: + # Transformers v5 + rope_parameters = config.rope_parameters[layer_type] + base = rope_parameters["rope_theta"] else: - # Global attention. Use the values in config.json. - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling + # Transformers v4 + if self.is_sliding: + # Local attention. Override the values in config.json. + base = config.rope_local_base_freq + rope_parameters = {"rope_type": "default"} + else: + # Global attention. Use the values in config.json. + base = config.rope_theta + rope_parameters = config.rope_parameters + self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + base=base, is_neox_style=True, - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) if getattr(config, "is_causal", True): diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 22d51ab76269..154043ca9983 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -332,18 +332,25 @@ def __init__( ) layer_idx = extract_layer_index(prefix) - is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" self.sliding_window = config.sliding_window if is_sliding else None # Initialize the rotary embedding. - if is_sliding: - # Local attention. Override the values in config.json. - rope_theta = config.rope_local_base_freq - rope_scaling = {"rope_type": "default"} + if config.rope_parameters and layer_type in config.rope_parameters: + # Transformers v5 + rope_parameters = config.rope_parameters[layer_type] + base = rope_parameters["rope_theta"] else: - # Global attention. Use the values in config.json. - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + # Transformers v4 + if is_sliding: + # Local attention. Override the values in config.json. + base = config.rope_local_base_freq + rope_parameters = {"rope_type": "default"} + else: + # Global attention. Use the values in config.json. 
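For the Transformers v5 branch added above and continued below, the per-layer-type shape it expects is roughly the following. The concrete values are invented for illustration; only the nesting by `layer_types` entries and the `rope_theta` key inside each entry are the point:

# Illustrative v5-style config fragment (values invented):
rope_parameters = {
    "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
    "full_attention": {"rope_type": "linear", "rope_theta": 1_000_000.0, "factor": 8.0},
}

layer_type = "full_attention"
if layer_type in rope_parameters:
    layer_rope = rope_parameters[layer_type]
    base = layer_rope["rope_theta"]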
+ base = config.rope_theta + rope_parameters = config.rope_parameters first_kv_shared_layer_idx = ( config.num_hidden_layers - config.num_kv_shared_layers @@ -383,9 +390,9 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + base=base, is_neox_style=True, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( From 65c8658a52d54fe9d94ee24eb86cf0c1a2ba7393 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:18:24 +0100 Subject: [PATCH 10/70] Get `rope_theta` from the new location too Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 2 +- tests/kernels/core/test_mrope.py | 16 +++++++--------- vllm/model_executor/models/arctic.py | 2 +- vllm/model_executor/models/bailing_moe.py | 2 +- vllm/model_executor/models/gemma.py | 2 +- vllm/model_executor/models/gemma2.py | 2 +- vllm/model_executor/models/gemma3.py | 2 +- vllm/model_executor/models/gemma3n.py | 2 +- vllm/model_executor/models/gpt_oss.py | 4 ++-- vllm/model_executor/models/llama4.py | 2 +- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/olmo.py | 2 +- vllm/model_executor/models/persimmon.py | 2 +- vllm/model_executor/models/plamo2.py | 6 +++++- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 2 +- vllm/model_executor/models/step3_text.py | 2 +- vllm/model_executor/models/zamba2.py | 2 +- vllm/transformers_utils/config.py | 9 ++++++++- 20 files changed, 38 insertions(+), 29 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index b9036dad09d3..d445a1d69466 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -303,7 +303,7 @@ def benchmark_mrope( q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_theta = config.rope_parameters["rope_theta"] max_position = config.max_position_embeddings for num_tokens in num_tokens_list: diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 02b795721f46..5ccab51c2a24 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -5,11 +5,11 @@ import pytest import torch from packaging.version import Version -from transformers import AutoConfig from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -98,8 +98,7 @@ def test_mrope( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -113,7 +112,7 @@ def test_mrope( ) is_neox_style = True - rope_theta = config.rope_theta + rope_theta = config.rope_parameters["rope_theta"] max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -124,7 +123,7 @@ def test_mrope( max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, - 
rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) @@ -173,8 +172,7 @@ def test_mrope_torch_compile_tracing( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -187,7 +185,7 @@ def test_mrope_torch_compile_tracing( else config.hidden_size // total_num_heads ) is_neox_style = True - rope_theta = config.rope_theta + rope_theta = config.rope_parameters["rope_theta"] max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -198,7 +196,7 @@ def test_mrope_torch_compile_tracing( max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b5cc07a56535..760f72a6073a 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -292,7 +292,7 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] self.scaling = self.head_dim**-0.5 self.qkv_proj = QKVParallelLinear( diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 4f7ae42049e8..862dadc64b67 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -136,7 +136,7 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, + base=config.rope_parameters["rope_theta"], is_neox_style=True, rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 7aaae7c503b5..47542e3ed157 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -218,7 +218,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_theta=config.rope_parameters["rope_theta"], cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4d5d6cbb37c6..c0df586aafc7 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -206,7 +206,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_theta=config.rope_parameters["rope_theta"], cache_config=cache_config, quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index c785ae891afb..9eec8f93c8e8 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -172,7 +172,7 @@ def __init__( rope_parameters = {"rope_type": "default"} else: # Global attention. Use the values in config.json. 
- base = config.rope_theta + base = config.rope_parameters["rope_theta"] rope_parameters = config.rope_parameters self.rotary_emb = get_rope( diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index c3a6dd555aa7..c2faf2b21a76 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -349,7 +349,7 @@ def __init__( rope_parameters = {"rope_type": "default"} else: # Global attention. Use the values in config.json. - base = config.rope_theta + base = config.rope_parameters["rope_theta"] rope_parameters = config.rope_parameters first_kv_shared_layer_idx = ( diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index a4ac53e12baa..a7ef95f28763 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,7 +67,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, + base=config.rope_parameters["rope_theta"], dtype=torch.float32, rope_parameters={ "rope_type": "yarn", @@ -90,7 +90,7 @@ def __init__( self.q_size = self.num_attention_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] self.qkv_proj = QKVParallelLinear( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index dae85328d434..d23c672d9158 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -331,7 +331,7 @@ def __init__( self.layer_idx = extract_layer_index(prefix) self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size - rope_theta = config.rope_theta + rope_theta = config.rope_parameters["rope_theta"] rope_parameters = config.rope_parameters max_position_embeddings = config.max_position_embeddings diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 5422fb443ead..ac234e6a47ec 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -291,7 +291,7 @@ def __init__( rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, - base=config.rope_theta, + base=config.rope_parameters["rope_theta"], rope_parameters={"rope_type": "mllama4"}, is_neox_style=False, dtype=torch.complex64, # important diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ab83a271e30a..0853d8ac0257 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -410,7 +410,7 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] # Attention input projection. 
Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 487e3f671a45..7700b55d605b 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -87,7 +87,7 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] self.clip_qkv = config.clip_qkv # Attention input projection. Projects x -> (q, k, v) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3bf6a1d9763d..1aeca4c048d8 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,7 @@ def __init__( self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] self.partial_rotary_factor = config.partial_rotary_factor self.is_causal = True diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 4e60a182d7e3..5c0de87221db 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -576,7 +576,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) - self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 + self.rope_theta = ( + config.rope_parameters["rope_theta"] + if hasattr(config, "rope_parameters") + else 10000 + ) self.rope_parameters = getattr(config, "rope_parameters", None) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 06eb7201c1a8..00a859aa4d5e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -153,7 +153,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + base=self.config.rope_parameters["rope_theta"], partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0f2942acd500..d9f972287a97 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -91,7 +91,7 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 33a1122f8bd7..13609e773200 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -247,7 +247,7 @@ def __init__( max_position_embedding=config.max_position_embedding, head_dim=config.head_dim, share_q_dim=config.share_q_dim, - rope_theta=config.rope_theta, + 
rope_theta=config.rope_parameters["rope_theta"], rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", ) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 5443348ebbcb..5c708a35682d 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -128,7 +128,7 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_hybrid_layers = num_hybrid_layers - self.rope_theta = config.rope_theta + self.rope_theta = config.rope_parameters["rope_theta"] self.attention_hidden_size = config.attention_hidden_size self.total_num_attention_heads = config.num_attention_heads diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6ea234d9cdc3..684ef12bdae7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -396,12 +396,19 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: text_config = config.get_text_config() # (Transformers v5, Transformers v4) rope_parameters_keys = ("rope_parameters", "rope_scaling") - rope_parameters = getattr_iter(text_config, rope_parameters_keys, None) + rope_parameters: dict | None = getattr_iter(text_config, rope_parameters_keys, None) # Forward compatibility for Transformers v5 # (can be removed once Transformers v4 is no longer supported) cls_attr = getattr(type(text_config), "rope_scaling", None) if not isinstance(cls_attr, property): + # rope_theta now lives in rope_parameters + if rope_theta := getattr(text_config, "rope_theta", None): + # Ensure rope_parameters exists if rope_theta is set + rope_parameters = rope_parameters or {} + rope_parameters["rope_theta"] = rope_theta + delattr(text_config, "rope_theta") + # Move rope config from rope_scaling to rope_parameters text_config.rope_parameters = rope_parameters delattr(text_config, "rope_scaling") From 5d657391b1dfa1ea0e367c56de0f8c6044160421 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:18:47 +0100 Subject: [PATCH 11/70] Fix condition for non gemma3 models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 84b3b7f6a41a..866b5fe42bfb 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2075,7 +2075,7 @@ def _get_and_verify_max_len( if rope_parameters is not None and "gemma3" not in hf_config.model_type: # In Transformers v5 this could be RopeParameters or dict[str, RopeParameters] # To simplify, we convert any RopeParameters to dict[str, RopeParameters] - if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): rope_parameters = {"": rope_parameters} for rp in rope_parameters.values(): rope_type = rp["rope_type"] From b4e1967cfcd94b53edf233f0dc7cc89a4dddf878 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:27:31 +0100 Subject: [PATCH 12/70] Make Transformers backend torch compile check work with new rope params Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 267a6e06e6bb..140ba48eaf02 
100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,6 +22,7 @@ import torch from torch import nn +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: """ text_config = vllm_config.model_config.hf_config.get_text_config() # Dynamic rope scaling is not compatible with torch.compile - rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} - return rope_scaling.get("rope_type") != "dynamic" + rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} + if rope_parameters: + # Nest rope_parameters if not nested already to simplify logic + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters = {"": rope_parameters} + return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) + return True From ee77bd7be6c3b2f3c9d63cadc8a1f91f85169e69 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 11:36:26 +0100 Subject: [PATCH 13/70] Re-enable a load of Transformers nightly tests which are now fixed Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index be1b79ddc432..c2be9ab4e758 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -874,12 +874,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper From df4c00757318195f26768af11e9a09b7741d2fa0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:00:27 +0100 Subject: [PATCH 14/70] Update the custom configs Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/config.py | 29 ++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 66b246878b0a..69211b5d1e6a 
100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -42,12 +42,13 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads + rope_parameters = getattr(config, "rope_parameters", None) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "base": rope_parameters["rope_theta"], + "rope_parameters": rope_parameters, } @@ -78,12 +79,16 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) + rope_parameters = getattr(config, "rope_parameters", None) or {} + rope_theta = getattr(config, "rope_theta", config.rotary_emb_base) + rope_parameters["rope_theta"] = rope_theta + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "base": rope_parameters["rope_theta"], + "rope_parameters": rope_parameters, } @@ -117,18 +122,21 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) + rope_parameters = getattr(config, "rope_parameters", None) or {} + rope_theta = getattr(config, "rope_theta", config.rotary_emb_base) + rope_parameters["rope_theta"] = rope_theta config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "base": rope_parameters["rope_theta"], + "rope_parameters": rope_parameters, } # we ignore config.rotary_scaling_factor so that for datasets shorter # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. - # The context extension uses vllm style rope_theta and rope_scaling. + # The context extension uses vllm style rope_theta and rope_parameters. 
# See #17785 #18755 if ( not vllm_config.model_config.hf_overrides @@ -172,7 +180,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") hf_text_config.max_position_embeddings = max_trained_positions - hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] # The priority of sentence_bert_config.json is higher # than max_position_embeddings @@ -242,12 +250,13 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads + rope_parameters = getattr(config, "rope_parameters", None) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "base": rope_parameters["rope_theta"], + "rope_parameters": rope_parameters, } From 325ff8d27cf825b59ac8722388cd48244fdbe1f0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:04:56 +0100 Subject: [PATCH 15/70] Make sure scaling factor always exists Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config/model.py b/vllm/config/model.py index 866b5fe42bfb..b51c6f30766d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2079,6 +2079,7 @@ def _get_and_verify_max_len( rope_parameters = {"": rope_parameters} for rp in rope_parameters.values(): rope_type = rp["rope_type"] + scaling_factor = 1.0 if rope_type not in ("su", "longrope", "llama3"): if disable_sliding_window: From 11c23a726f9bf43c28dec31cb88d7b036d506e17 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:29:03 +0100 Subject: [PATCH 16/70] A couple more models that now init on v5 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c2be9ab4e758..69bce731cf8c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -874,7 +874,7 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py From 4ea113c69b99203cf52834180cc075ee64491b2e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:42:49 +0100 Subject: [PATCH 17/70] Update Commandr Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/commandr.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff 
--git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 77bb17851981..4bb14233614f 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -156,8 +156,6 @@ def __init__( self.max_position_embeddings = getattr( config, "model_max_length", None ) or getattr(config, "max_position_embeddings", 8192) - self.rope_theta = config.rope_theta - self.rope_scaling = getattr(config, "rope_scaling", None) self.use_qk_norm = getattr(config, "use_qk_norm", False) self.qkv_proj = QKVParallelLinear( self.hidden_size, @@ -175,23 +173,29 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=self.rope_scaling, - is_neox_style=False, - ) # Model v2 has interleaved sliding windows, v1 does not self.v1 = isinstance(config, CohereConfig) self.sliding_window = None + rope_parameters = getattr(config, "rope_parameters", None) if not self.v1: layer_idx = extract_layer_index(prefix) - if config.layer_types[layer_idx] == "sliding_attention": + layer_type = config.layer_types[layer_idx] + if layer_type == "sliding_attention": self.sliding_window = config.sliding_window + if config.rope_parameters and layer_type in rope_parameters: + # Transformers v5 + rope_parameters = rope_parameters[layer_type] + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=rope_parameters["rope_theta"], + rope_parameters=rope_parameters, + is_neox_style=False, + ) self.attn = Attention( self.num_heads, From 59b0f2700067f77ea03c8cf60e5fd93623f01c70 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:49:48 +0100 Subject: [PATCH 18/70] Update Qwen3Next Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/qwen3_next.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 86508a7c6431..34ed6e5524b8 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -753,8 +753,8 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + base=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, partial_rotary_factor=config.partial_rotary_factor, dual_chunk_attention_config=self.dual_chunk_attention_config, ) From 064441bebede20487e2d96324657a1ea0653e2cc Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 12:56:06 +0100 Subject: [PATCH 19/70] Update Olmo2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/olmo2.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 045582c889ee..a9ec6b501837 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -99,7 +99,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = 
self.config.max_position_embeddings - self.rope_theta = self.config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -123,10 +122,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer_idx = extract_layer_index(prefix) sliding_window = None - if ( - layer_types := getattr(self.config, "layer_types", None) - ) is not None and layer_types[layer_idx] == "sliding_attention": - sliding_window = self.config.sliding_window + rope_parameters = {"rope_theta": self.config.rope_parameters["rope_theta"]} + if layer_types := getattr(self.config, "layer_types", None): + layer_type = layer_types[layer_idx] + if layer_type == "sliding_attention": + sliding_window = self.config.sliding_window + elif layer_type == "full_attention": + # Rope scaling is only applied on full attention layers. + rope_parameters.update(self.config.rope_parameters) self.attn = Attention( self.num_heads, @@ -139,15 +142,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.attn", ) - # Rotary embeddings. Rope scaling is only applied on full attention - # layers. - self.rope_scaling = self.config.rope_scaling if sliding_window is None else None + # Rotary embeddings. self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - rope_scaling=self.rope_scaling, + base=rope_parameters["rope_theta"], + rope_parameters=rope_parameters, ) # Attention output projection. From bdd0e6c587af67073acf07b58177d9013ebba415 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:00:18 +0100 Subject: [PATCH 20/70] rope_parameters always present because of rope_theta Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/commandr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 4bb14233614f..872a13f53cb4 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -178,13 +178,13 @@ def __init__( self.v1 = isinstance(config, CohereConfig) self.sliding_window = None - rope_parameters = getattr(config, "rope_parameters", None) + rope_parameters = config.rope_parameters if not self.v1: layer_idx = extract_layer_index(prefix) layer_type = config.layer_types[layer_idx] if layer_type == "sliding_attention": self.sliding_window = config.sliding_window - if config.rope_parameters and layer_type in rope_parameters: + if layer_type in rope_parameters: # Transformers v5 rope_parameters = rope_parameters[layer_type] From f224ef4c1ba0595b93c86450082b6938aeb76a8d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:07:52 +0100 Subject: [PATCH 21/70] Update LFM2MoE Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/lfm2_moe.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 6b7b5564ee98..1d9e660651dc 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -190,7 +190,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = 
None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -243,7 +243,7 @@ def __init__( rotary_dim=self.head_dim, max_position=self.max_position_embeddings, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -293,14 +293,8 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2MoeAttention( @@ -309,8 +303,8 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, From 19dcc189128464decf4fcb02e97b89e5426490ae Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:09:11 +0100 Subject: [PATCH 22/70] Update LFM2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/lfm2.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index aeb25602f11a..a507875adcf7 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -97,7 +97,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -150,7 +150,7 @@ def __init__( rotary_dim=self.head_dim, max_position=self.max_position_embeddings, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -199,14 +199,8 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2Attention( @@ -215,8 +209,8 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, 
quant_config=quant_config, From 2eecd312107c566dcaf0505c114323af030c8396 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:18:28 +0100 Subject: [PATCH 23/70] Update the rest Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/apertus.py | 22 +++++------- vllm/model_executor/models/deepseek_v2.py | 34 +++++++++---------- vllm/model_executor/models/exaone4.py | 18 ++++------ .../model_executor/models/granitemoehybrid.py | 6 ++-- vllm/model_executor/models/llama.py | 22 +++++------- 5 files changed, 40 insertions(+), 62 deletions(-) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 0a8f21abb0a3..d118a249c9a9 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -119,7 +119,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -177,7 +177,7 @@ def __init__( ) self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config + config, rope_parameters=rope_parameters, quant_config=quant_config ) sliding_window = None @@ -224,7 +224,7 @@ def forward( def _init_rotary_emb( self, config: ApertusConfig, - rope_scaling: dict[str, Any] | None, + rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -237,7 +237,7 @@ def _init_rotary_emb( rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -253,14 +253,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -288,8 +282,8 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 115818d903a6..ab9115c45793 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -410,7 +410,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | 
None = None, @@ -485,21 +485,21 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if rope_parameters: + rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if rope_parameters: + mscale_all_dim = rope_parameters.get("mscale_all_dim", False) + scaling_factor = rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -904,7 +904,7 @@ def __init__( q_lora_rank: int | None, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -981,19 +981,19 @@ def __init__( prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if rope_parameters: + rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if rope_parameters: + mscale_all_dim = rope_parameters.get("mscale_all_dim", False) + scaling_factor = rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -1073,8 +1073,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) moe_layer_freq = getattr(config, "moe_layer_freq", 1) # DecoderLayers are created with `make_layers` which passes the prefix @@ -1107,8 +1105,8 @@ def __init__( v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index b89e168ada20..5c61ea23b1aa 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -111,7 +111,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 1000000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -181,7 +181,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -227,14 +227,8 @@ def __init__( ) 
-> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -249,8 +243,8 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 05177f1d1ac2..3b6af559058d 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -274,10 +274,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=int(config.rope_theta), - rope_scaling=config.rope_scaling - if hasattr(config, "rope_scaling") and config.rope_scaling is not None - else None, + base=int(config.rope_parameters["rope_theta"]), + rope_parameters=config.rope_parameters, is_neox_style=True, ) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c49a1ea817f9..54efcd817a6f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -121,7 +121,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -187,7 +187,7 @@ def __init__( ) self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config + config, rope_parameters=rope_parameters, quant_config=quant_config ) sliding_window = None @@ -258,7 +258,7 @@ def forward( def _init_rotary_emb( self, config: LlamaConfig, - rope_scaling: dict[str, Any] | None, + rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -271,7 +271,7 @@ def _init_rotary_emb( rotary_dim=self.head_dim, max_position=self.max_position_embeddings, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -291,14 +291,8 @@ def __init__( quant_config = self.get_quant_config(vllm_config) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, 
"max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -326,8 +320,8 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, From e95ccd4deaa610d58d1a0d2656949932411bef5c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:29:20 +0100 Subject: [PATCH 24/70] update tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/kernels/core/test_pos_encoding.py | 20 +++++++++---------- .../pooling/test_nomic_max_model_len.py | 6 +++--- tests/test_config.py | 13 ++++++------ 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index c35ee5016ba0..ef4c0adb8d31 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -121,7 +121,7 @@ def test_rotary_embedding( def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] - ROPE_SCALINGS = ( + ROPE_PARAMETERS = ( None, {"rope_type": "linear", "factor": (1,)}, {"rope_type": "dynamic", "factor": 1}, @@ -132,7 +132,7 @@ def test_rope_module_cache(): MAX_POSITIONS, BASES, IS_NEOX_STYLE, - ROPE_SCALINGS, + ROPE_PARAMETERS, DTYPES, ) rope_setting_id_map: dict[str, int] = {} @@ -142,8 +142,8 @@ def test_rope_module_cache(): rotary_dim, max_position, base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: @@ -153,8 +153,8 @@ def test_rope_module_cache(): rotary_dim, max_position, base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # different settings cannot share the same rope module @@ -169,8 +169,8 @@ def test_rope_module_cache(): rotary_dim, max_position, base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: @@ -180,8 +180,8 @@ def test_rope_module_cache(): rotary_dim, max_position, base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # check if cache take effect diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 88f088c60327..589a62dfd6a7 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -80,7 +80,7 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides = { "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -98,7 +98,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): def test_use_rope_scaling_illegal(model_info, vllm_runner): hf_overrides = { "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -116,7 +116,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): hf_overrides = { "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": 
{ "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db2..de36463a6f99 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -254,19 +254,20 @@ def test_rope_customization(): LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct") - assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None + assert getattr(llama_model_config.hf_config, "rope_parameters", None) is None assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 assert llama_model_config.max_model_len == 8192 llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_SCALING, "rope_theta": TEST_ROPE_THETA, }, ) assert ( - getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING + getattr(llama_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_SCALING ) assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 @@ -274,7 +275,7 @@ def test_rope_customization(): longchat_model_config = ModelConfig("lmsys/longchat-13b-16k") # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config assert all( - longchat_model_config.hf_config.rope_scaling.get(key) == value + longchat_model_config.hf_config.rope_parameters.get(key) == value for key, value in LONGCHAT_ROPE_SCALING.items() ) assert longchat_model_config.max_model_len == 16384 @@ -282,11 +283,11 @@ def test_rope_customization(): longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_SCALING, }, ) assert ( - getattr(longchat_model_config.hf_config, "rope_scaling", None) + getattr(longchat_model_config.hf_config, "rope_parameters", None) == TEST_ROPE_SCALING ) assert longchat_model_config.max_model_len == 4096 From f2bac1564781c2f7e0c29566ca7563e8dd92ddb5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:37:23 +0100 Subject: [PATCH 25/70] Update configs Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../moe/test_gpt_oss_triton_kernels.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/openpangu.py | 14 +++---- vllm/transformers_utils/configs/flex_olmo.py | 10 +++-- .../transformers_utils/configs/kimi_linear.py | 6 ++- .../transformers_utils/configs/midashenglm.py | 2 +- vllm/transformers_utils/configs/mistral.py | 4 +- vllm/transformers_utils/configs/nemotron.py | 41 +++++++++++-------- vllm/transformers_utils/configs/olmo3.py | 6 ++- 9 files changed, 50 insertions(+), 39 deletions(-) diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index dfd317bcf72f..af33fd4e3fc3 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -201,7 +201,7 @@ class ModelConfig: sliding_window: int = 128 initial_context_length: int = 4096 rope_theta: float = 150000.0 - rope_scaling_factor: float = 32.0 + rope_parameters_factor: float = 32.0 rope_ntk_alpha: float = 1.0 rope_ntk_beta: float = 32.0 diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 
ab9115c45793..2ac7d7892721 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -112,7 +112,7 @@ def __init__( hidden_size: int, num_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -163,7 +163,7 @@ def __init__( rotary_dim=self.head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index d13a745beffe..373c1fe2f036 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -339,7 +339,7 @@ def __init__( ) # TODO: remove hard coding - rope_scaling = { + rope_parameters = { "beta_fast": 32, "beta_slow": 1, "factor": 1, @@ -354,7 +354,7 @@ def __init__( rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) @@ -408,7 +408,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -476,7 +476,7 @@ def __init__( ) self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config + config, rope_parameters=rope_parameters, quant_config=quant_config ) if hasattr(config, "interleaved_sliding_window"): @@ -521,7 +521,7 @@ def forward( def _init_rotary_emb( self, config: PretrainedConfig, - rope_scaling: dict[str, Any] | None, + rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -534,7 +534,7 @@ def _init_rotary_emb( rotary_dim=self.head_dim, max_position=self.max_position_embeddings, base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) @@ -608,7 +608,7 @@ def __init__( config, "num_key_value_heads", config.num_attention_heads ), rope_theta=rope_theta, - rope_scaling=getattr(config, "rope_scaling", None), + rope_parameters=getattr(config, "rope_parameters", None), max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 1f2f4d446288..30acfce5480c 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -26,7 +26,7 @@ def __init__( eos_token_id=100257, tie_word_embeddings=False, rope_theta=500000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, num_experts_per_tok=5, @@ -63,7 +63,9 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -73,5 +75,5 @@ def __init__( 
self.norm_topk_prob = norm_topk_prob # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + if self.rope_parameters is not None and "type" in self.rope_parameters: + self.rope_parameters["rope_type"] = self.rope_parameters["type"] diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index 65ddf48c5249..cad8449f2443 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -30,7 +30,7 @@ def __init__( bos_token_id=1, eos_token_id=2, rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, tie_word_embeddings=False, moe_intermediate_size: int | None = None, moe_renormalize: bool = True, @@ -74,7 +74,9 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py index e49bd26b2b00..f1bbd057103e 100644 --- a/vllm/transformers_utils/configs/midashenglm.py +++ b/vllm/transformers_utils/configs/midashenglm.py @@ -98,6 +98,6 @@ def __init__( if text_config else Qwen2_5OmniTextConfig() ) - self.text_config.rope_scaling = None # uses_mrope is false + self.text_config.rope_parameters = None # uses_mrope is false self.audio_token_id = audio_token_id super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index c6f04febe37e..8f72f0b28b0d 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict: "apply_scale": "apply_yarn_scaling", } yarn_config = config.get("yarn") or {} - config["rope_scaling"] = { + config["rope_parameters"] = { "rope_type": "yarn", "mscale_all_dim": 1, } for old_name, new_name in yarn_config_map.items(): if old_name in yarn_config: - config["rope_scaling"][new_name] = yarn_config.pop(old_name) + config["rope_parameters"][new_name] = yarn_config.pop(old_name) assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 60eed549561f..87a7239fda57 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -133,7 +133,7 @@ def __init__( eos_token_id=3, tie_word_embeddings=False, rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, @@ -161,7 +161,9 @@ def __init__( self.norm_eps = norm_eps self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") @@ -182,31 +184,34 @@ def __init__( **kwargs, ) - def 
_rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: raise ValueError( - "`rope_scaling` must be a dictionary with two fields, " - f"`type` and `factor`, got {self.rope_scaling}" + "`rope_parameters` must be a dictionary with two fields, " + f"`type` and `factor`, got {self.rope_parameters}" ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + rope_parameters_type = self.rope_parameters.get("type", None) + rope_parameters_factor = self.rope_parameters.get("factor", None) + if rope_parameters_type is None or rope_parameters_type not in [ + "linear", + "dynamic", + ]: raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}" + "`rope_parameters`'s type field must be one of ['linear', " + f"'dynamic'], got {rope_parameters_type}" ) if ( - rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0 + rope_parameters_factor is None + or not isinstance(rope_parameters_factor, float) + or rope_parameters_factor <= 1.0 ): raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, got " - f"{rope_scaling_factor}" + "`rope_parameters`'s factor field must be a float > 1, got " + f"{rope_parameters_factor}" ) diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index f5a9a7cd36bd..ed0dfdcbc6f5 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -25,7 +25,7 @@ def __init__( eos_token_id=50279, tie_word_embeddings=False, rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, rms_norm_eps=1e-5, @@ -64,7 +64,9 @@ def __init__( self.initializer_range = initializer_range self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout From 36e8a1f8f932318801270c3c884317b26523eaf0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:38:32 +0100 Subject: [PATCH 26/70] Missed 2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/configs/qwen3_next.py | 8 +++++--- vllm/transformers_utils/configs/step3_vl.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index 21750bde2f87..f9cf4cec438a 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -68,7 +68,7 @@ class Qwen3NextConfig(PretrainedConfig): Whether the model's input and output word embeddings should be tied. rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. 
- rope_scaling (`Dict`, *optional*): + rope_parameters (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. @@ -200,7 +200,7 @@ def __init__( use_cache=True, tie_word_embeddings=False, rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.25, attention_bias=False, attention_dropout=0.0, @@ -237,7 +237,9 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index 637b82d88e26..d3a49d7eafc3 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -53,7 +53,7 @@ def __init__( moe_num_experts: int = 48, moe_top_k: int = 3, rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 65536, share_expert_dim: int = 5120, share_q_dim: int = 2048, @@ -131,7 +131,9 @@ def __init__( self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.max_position_embedding = max_position_embedding self.share_expert_dim = share_expert_dim self.share_q_dim = share_q_dim From dfa75cffc4e8e52cf5bf3753130725d0f1241f38 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:58:47 +0100 Subject: [PATCH 27/70] Improve comment about what `rope_parameters` is Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index b51c6f30766d..4f389f306706 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2073,8 +2073,9 @@ def _get_and_verify_max_len( # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. if rope_parameters is not None and "gemma3" not in hf_config.model_type: - # In Transformers v5 this could be RopeParameters or dict[str, RopeParameters] - # To simplify, we convert any RopeParameters to dict[str, RopeParameters] + # In Transformers v5 rope_parameters could be RopeParameters or + # dict[str, RopeParameters] where RopeParameters is a TypedDict. 
To simplify + # the verification, we convert any RopeParameters to dict[str, RopeParameters] if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): rope_parameters = {"": rope_parameters} for rp in rope_parameters.values(): From 708ea0c3d4753076138ed6f580b2103a46fa9266 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 13:59:06 +0100 Subject: [PATCH 28/70] Move scaling factor out of loop Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 4f389f306706..c70476d28e66 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2078,9 +2078,9 @@ def _get_and_verify_max_len( # the verification, we convert any RopeParameters to dict[str, RopeParameters] if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): rope_parameters = {"": rope_parameters} + scaling_factor = 1.0 for rp in rope_parameters.values(): rope_type = rp["rope_type"] - scaling_factor = 1.0 if rope_type not in ("su", "longrope", "llama3"): if disable_sliding_window: @@ -2091,14 +2091,14 @@ def _get_and_verify_max_len( "rope_parameters. Please raise an issue so we can investigate." ) - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rp.get("factor", 1.0) + # NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + # NOTE: This assumes all layer types have the same scaling factor. + scaling_factor = rp.get("factor", scaling_factor) if rope_type == "yarn": derived_max_model_len = rp["original_max_position_embeddings"] - # Do this outside loop since all layers should have the same scaling - derived_max_model_len *= scaling_factor + # Do this outside loop since all layers should have the same scaling + derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: derived_max_model_len = encoder_config["max_seq_length"] From 4a285129458a79ab422779fab6a4bb3d6ae88623 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:06:55 +0100 Subject: [PATCH 29/70] Early exit `patch_rope_parameters` if no rope params present Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 684ef12bdae7..192418fa7295 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -397,7 +397,9 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: # (Transformers v5, Transformers v4) rope_parameters_keys = ("rope_parameters", "rope_scaling") rope_parameters: dict | None = getattr_iter(text_config, rope_parameters_keys, None) - + # No rope parameters to patch + if rope_parameters is None: + return # Forward compatibility for Transformers v5 # (can be removed once Transformers v4 is no longer supported) cls_attr = getattr(type(text_config), "rope_scaling", None) @@ -412,13 +414,12 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: text_config.rope_parameters = rope_parameters delattr(text_config, 
"rope_scaling") - if rope_parameters is not None: - # Handle nested rope_parameters in interleaved sliding attention models - if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): - for rope_parameters_layer_type in rope_parameters.values(): - patch_rope_parameters_dict(rope_parameters_layer_type) - else: - patch_rope_parameters_dict(rope_parameters) + # Handle nested rope_parameters in interleaved sliding attention models + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in rope_parameters.values(): + patch_rope_parameters_dict(rope_parameters_layer_type) + else: + patch_rope_parameters_dict(rope_parameters) def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: From dfb476f7bad2bd67b5d6910439f61adf4421b134 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:09:50 +0100 Subject: [PATCH 30/70] Be more explicit about v4 vs v5 behaviour Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 45 ++++++++++++++++++------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 192418fa7295..83be6a404227 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -24,7 +24,9 @@ RepositoryNotFoundError, RevisionNotFoundError, ) +from packaging.version import Version from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( @@ -35,7 +37,6 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs -from vllm.config.utils import getattr_iter from vllm.logger import init_logger from vllm.transformers_utils.config_parser_base import ConfigParserBase from vllm.transformers_utils.utils import ( @@ -394,25 +395,33 @@ def file_or_path_exists( def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" text_config = config.get_text_config() - # (Transformers v5, Transformers v4) - rope_parameters_keys = ("rope_parameters", "rope_scaling") - rope_parameters: dict | None = getattr_iter(text_config, rope_parameters_keys, None) - # No rope parameters to patch - if rope_parameters is None: - return - # Forward compatibility for Transformers v5 - # (can be removed once Transformers v4 is no longer supported) - cls_attr = getattr(type(text_config), "rope_scaling", None) - if not isinstance(cls_attr, property): - # rope_theta now lives in rope_parameters - if rope_theta := getattr(text_config, "rope_theta", None): - # Ensure rope_parameters exists if rope_theta is set - rope_parameters = rope_parameters or {} + + if Version(TRANSFORMERS_VERSION) >= Version("5.0.0.dev0"): + from transformers.modeling_rope_utils import RopeParameters + + rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( + text_config, "rope_parameters", None + ) + else: + # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters + rope_theta: float | None = getattr(text_config, "rope_theta", None) + rope_scaling: dict | None = getattr(text_config, "rope_scaling", None) + rope_parameters = rope_scaling + # Move rope_theta into rope_parameters + 
if rope_theta is not None: + rope_parameters = rope_parameters or {"rope_type": "default"} rope_parameters["rope_theta"] = rope_theta - delattr(text_config, "rope_theta") - # Move rope config from rope_scaling to rope_parameters + # Write back to text_config text_config.rope_parameters = rope_parameters - delattr(text_config, "rope_scaling") + # Delete legacy attributes + if hasattr(text_config, "rope_theta"): + delattr(text_config, "rope_theta") + if hasattr(text_config, "rope_scaling"): + delattr(text_config, "rope_scaling") + + # No RoPE parameters to patch + if rope_parameters is None: + return # Handle nested rope_parameters in interleaved sliding attention models if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): From 97bb3394f665f1ab6fa7283c7dec7c8921c97dc2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:29:16 +0100 Subject: [PATCH 31/70] Update a few models to not pass `base` outside of `rope_parameters` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../layers/rotary_embedding/__init__.py | 27 +++++++------------ vllm/model_executor/models/arcee.py | 14 +++------- vllm/model_executor/models/baichuan.py | 9 +++---- vllm/model_executor/models/bamba.py | 3 --- vllm/model_executor/models/chameleon.py | 7 ----- vllm/model_executor/models/llama.py | 3 --- 6 files changed, 16 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 229598c17720..30b4c1116896 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -26,24 +26,19 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: float, + rope_parameters: dict[str, Any], is_neox_style: bool = True, - rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - if rope_parameters is not None: - # Transforms every value that is a list into a tuple for caching calls - rope_parameters_tuple = { - k: tuple(v) if isinstance(v, list) else v - for k, v in rope_parameters.items() - } - rope_parameters_args = tuple(rope_parameters_tuple.items()) - else: - rope_parameters_args = None + # Transforms every value that is a list into a tuple for caching calls + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v for k, v in rope_parameters.items() + } + rope_parameters_args = tuple(rope_parameters_tuple.items()) if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -61,15 +56,15 @@ def get_rope( head_size, rotary_dim, max_position, - base, - is_neox_style, rope_parameters_args, + is_neox_style, dual_chunk_attention_args, dtype, ) if key in _ROPE_DICT: return _ROPE_DICT[key] + base = rope_parameters["rope_theta"] if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -85,10 +80,6 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_parameters: - rotary_emb = RotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, dtype - ) else: scaling_type = rope_parameters["rope_type"] @@ -147,7 +138,7 @@ def get_rope( ) elif scaling_type == "ntk": scaling_factor = rope_parameters["factor"] - mixed_b = rope_parameters.get("mixed_b", None) + mixed_b = rope_parameters.get("mixed_b") rotary_emb 
= NTKScalingRotaryEmbedding( head_size, rotary_dim, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index e091ae1f6b1b..49b81a5ea0c3 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -103,15 +103,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Rotary embedding parameters (reuse LLaMA defaults) - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Determine if attention bias is needed (some variants use bias terms) attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -133,8 +126,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8991ef4c606b..af21e41c8763 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -136,7 +136,7 @@ def __init__( hidden_size: int, num_heads: int, position_embedding: str, - rope_theta: float = 10000, + rope_parameters: dict | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -150,7 +150,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = hidden_size // self.total_num_heads self.position_embedding = position_embedding - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # pylint: disable=invalid-name @@ -192,7 +191,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( @@ -229,13 +228,13 @@ def __init__( ): super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) + rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_theta=rope_theta, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 4007c6f8470c..53fdc30cda1c 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -157,7 +157,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) 
self.hidden_size = config.hidden_size @@ -179,7 +178,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -194,7 +192,6 @@ def __init__( rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, - base=rope_theta, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index cf84a9d2132e..ff0f07d3ce6c 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -264,7 +264,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, @@ -292,7 +291,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -317,7 +315,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, ) @@ -368,7 +365,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None @@ -384,7 +380,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, @@ -438,7 +433,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None @@ -454,7 +448,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 54efcd817a6f..547c06d310b0 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -120,7 +120,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, @@ -157,7 +156,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings llama_4_scaling_config = getattr(config, "llama_4_scaling", None) @@ -270,7 +268,6 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, rope_parameters=rope_parameters, 
is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, From 97766f5e71aba32602fc46aa13b7609950b32d13 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:38:38 +0100 Subject: [PATCH 32/70] Update some more models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/bailing_moe.py | 3 +-- vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/dbrx.py | 7 +++++-- vllm/model_executor/models/qwen2.py | 10 ++-------- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 3 +-- 6 files changed, 10 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 862dadc64b67..e53fa43d0bb4 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -136,9 +136,8 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_parameters["rope_theta"], - is_neox_style=True, rope_parameters=config.rope_parameters, + is_neox_style=True, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 872a13f53cb4..73ed86c87908 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -192,7 +192,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=rope_parameters["rope_theta"], rope_parameters=rope_parameters, is_neox_style=False, ) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 528ef4f76742..2c729019081a 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -197,7 +197,10 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads self.total_num_kv_heads = config.attn_config.kv_n_heads self.clip_qkv = config.attn_config.clip_qkv - self.rope_theta = config.attn_config.rope_theta + rope_parameters = { + "rope_type": "default", + "rope_theta": int(config.attn_config.rope_theta), + } self.max_position = config.max_seq_len # pylint: disable=invalid-name @@ -221,7 +224,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 2ea213728ef1..5a0654fb7383 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -114,11 +114,10 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -143,7 +142,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -167,7 +165,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - 
base=self.rope_theta, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) @@ -217,8 +214,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -237,10 +232,9 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 00a859aa4d5e..a738fcbb4ee2 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -153,7 +153,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=self.config.rope_parameters["rope_theta"], + rope_parameters=self.config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index d9f972287a97..1118fca3cac9 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -91,7 +91,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_parameters["rope_theta"] self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias @@ -115,7 +114,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( From 783962ba0f1b8512ade96b4ebe27f35349d1de7b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:52:52 +0100 Subject: [PATCH 33/70] Update some more models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/apertus.py | 6 +---- vllm/model_executor/models/arctic.py | 3 +-- vllm/model_executor/models/config.py | 8 ++----- vllm/model_executor/models/deepseek_v2.py | 16 +++----------- vllm/model_executor/models/dots1.py | 11 +--------- vllm/model_executor/models/ernie45_moe.py | 12 +++------- vllm/model_executor/models/ernie45_vl_moe.py | 11 +++------- vllm/model_executor/models/exaone.py | 23 +++----------------- vllm/model_executor/models/exaone4.py | 9 +------- vllm/model_executor/models/falcon.py | 3 +-- vllm/model_executor/models/falcon_h1.py | 6 +---- vllm/model_executor/models/gemma.py | 8 +++---- vllm/model_executor/models/gemma2.py | 5 +---- vllm/model_executor/models/gemma3.py | 16 +++++--------- vllm/model_executor/models/gemma3n.py | 16 +++++--------- vllm/model_executor/models/qwen3_next.py | 1 - 16 files changed, 37 insertions(+), 117 deletions(-) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index d118a249c9a9..d76d03427364 100644 --- a/vllm/model_executor/models/apertus.py +++ 
b/vllm/model_executor/models/apertus.py @@ -118,8 +118,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -155,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -236,7 +234,6 @@ def _init_rotary_emb( self.head_dim, rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, - base=self.rope_theta, rope_parameters=rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, @@ -282,7 +279,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=config.rope_parameters["rope_theta"], rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 760f72a6073a..b75a254761d4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -292,7 +292,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_parameters["rope_theta"] self.scaling = self.head_dim**-0.5 self.qkv_proj = QKVParallelLinear( @@ -317,7 +316,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 69211b5d1e6a..c5e9c876d68b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -42,13 +42,11 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads - rope_parameters = getattr(config, "rope_parameters", None) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": rope_parameters["rope_theta"], - "rope_parameters": rope_parameters, + "rope_parameters": config.rope_parameters, } @@ -250,13 +248,11 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads - rope_parameters = getattr(config, "rope_parameters", None) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": rope_parameters["rope_theta"], - "rope_parameters": rope_parameters, + "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 2ac7d7892721..53e4b507c086 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -111,8 +111,7 @@ def __init__( config: DeepseekV2Config | DeepseekV3Config, hidden_size: int, num_heads: int, - rope_theta: float = 10000, - rope_parameters: 
dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -139,7 +138,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -162,7 +160,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, ) self.attn = Attention( @@ -409,8 +406,7 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -430,7 +426,6 @@ def __init__( assert num_heads % tp_size == 0 self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings assert topk_indices_buffer is None, ( "topk_indices_buffer is not \ @@ -492,7 +487,6 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, is_neox_style=False, ) @@ -903,8 +897,7 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -927,7 +920,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: @@ -987,7 +979,6 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, is_neox_style=False, ) @@ -1105,7 +1096,6 @@ def __init__( v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_theta=config.rope_parameters["rope_theta"], rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index f3616bda5133..e65c275106a4 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -202,8 +201,6 @@ def __init__( num_heads: int, num_kv_heads: int, config: Dots1Config, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -229,7 +226,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings attention_bias = config.attention_bias @@ -255,8 +251,7 @@ def __init__( 
self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -296,8 +291,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = layer_idx @@ -307,8 +300,6 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, config=config, - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index e8a718036fad..9da7a9c425ba 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -232,9 +232,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, @@ -266,7 +265,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -291,9 +289,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - is_neox_style=False, rope_parameters=rope_parameters, + is_neox_style=False, ) self.attn = Attention( self.num_heads, @@ -333,16 +330,13 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 24150e7c5242..f609e0187d32 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -91,9 +91,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_parameters: dict[str, Any] | None = None, freq_allocation: int = 20, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, @@ -126,7 +125,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -155,7 +153,7 @@ 
def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position_embeddings=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=False, dtype=torch.get_default_dtype(), mrope_section=[h_rope, w_rope, t_rope], @@ -413,8 +411,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_parameters = getattr(config, "rope_parameters", None) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) @@ -423,8 +419,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, freq_allocation=freq_allocation, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index bee3cf57d821..1727775d452a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -113,8 +112,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -144,7 +141,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -173,8 +169,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -207,8 +202,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -221,8 +214,6 @@ def __init__( hidden_size=hidden_size, num_heads=num_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=bias, @@ -251,14 +242,8 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -272,8 +257,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - 
rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 5c61ea23b1aa..5c74b7c1230e 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -23,7 +23,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -110,8 +109,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 1000000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -141,7 +138,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -180,8 +176,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -243,8 +238,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=config.rope_parameters["rope_theta"], - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 85acdff3d96b..dc2d51f340c8 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -164,13 +164,12 @@ def __init__( ) if self.use_rotary: - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 4b486084a914..8dba76180005 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -216,8 +216,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 1e11) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -242,7 +240,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -256,8 +253,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=None, # see impl of get_rope ) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 47542e3ed157..00c7f59a0809 100644 --- 
a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from collections.abc import Iterable from functools import cache from itertools import islice +from typing import Any import torch from torch import nn @@ -127,8 +128,8 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -153,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -176,7 +176,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -218,7 +218,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_parameters["rope_theta"], + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c0df586aafc7..9b6cfe693230 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -107,7 +107,6 @@ def __init__( num_kv_heads: int, head_dim: int, max_position_embeddings: int, - rope_theta: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, @@ -134,7 +133,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.query_pre_attn_scalar**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -156,7 +154,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, ) @@ -206,7 +204,6 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_parameters["rope_theta"], cache_config=cache_config, quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 9eec8f93c8e8..d3d26bc85c5e 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -163,25 +163,21 @@ def __init__( if config.rope_parameters and layer_type in config.rope_parameters: # Transformers v5 rope_parameters = config.rope_parameters[layer_type] - base = rope_parameters["rope_theta"] else: # Transformers v4 + + # Global attention. Use the values in config.json. + rope_parameters = config.rope_parameters + # Local attention. Override the values in config.json. if self.is_sliding: - # Local attention. Override the values in config.json. - base = config.rope_local_base_freq - rope_parameters = {"rope_type": "default"} - else: - # Global attention. Use the values in config.json. 
- base = config.rope_parameters["rope_theta"] - rope_parameters = config.rope_parameters + rope_parameters["rope_theta"] = config.rope_local_base_freq self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=base, - is_neox_style=True, rope_parameters=rope_parameters, + is_neox_style=True, ) if getattr(config, "is_causal", True): diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index c2faf2b21a76..d66bf6014615 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -340,17 +340,14 @@ def __init__( if config.rope_parameters and layer_type in config.rope_parameters: # Transformers v5 rope_parameters = config.rope_parameters[layer_type] - base = rope_parameters["rope_theta"] else: # Transformers v4 + + # Global attention. Use the values in config.json. + rope_parameters = config.rope_parameters + # Local attention. Override the values in config.json. if is_sliding: - # Local attention. Override the values in config.json. - base = config.rope_local_base_freq - rope_parameters = {"rope_type": "default"} - else: - # Global attention. Use the values in config.json. - base = config.rope_parameters["rope_theta"] - rope_parameters = config.rope_parameters + rope_parameters["rope_theta"] = config.rope_local_base_freq first_kv_shared_layer_idx = ( config.num_hidden_layers - config.num_kv_shared_layers @@ -390,9 +387,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=base, - is_neox_style=True, rope_parameters=rope_parameters, + is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 34ed6e5524b8..6809628d4f99 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -753,7 +753,6 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_parameters["rope_theta"], rope_parameters=config.rope_parameters, partial_rotary_factor=config.partial_rotary_factor, dual_chunk_attention_config=self.dual_chunk_attention_config, From 797fbeae12916b9e0fa57ce2e0018634f87d2bd3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:00:47 +0100 Subject: [PATCH 34/70] Add back `type` -> `rope_type` for legacy custom models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 2 ++ vllm/transformers_utils/config.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/vllm/config/model.py b/vllm/config/model.py index c70476d28e66..4a2b1e1465ce 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2080,6 +2080,8 @@ def _get_and_verify_max_len( rope_parameters = {"": rope_parameters} scaling_factor = 1.0 for rp in rope_parameters.values(): + # No need to consider "type" key because of patch_rope_parameters when + # loading HF config rope_type = rp["rope_type"] if rope_type not in ("su", "longrope", "llama3"): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 83be6a404227..9c4dbb5dae10 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -432,6 +432,20 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: + if "rope_type" in rope_parameters and 
"type" in rope_parameters: + rope_type = rope_parameters["rope_type"] + rope_type_legacy = rope_parameters["type"] + if rope_type != rope_type_legacy: + raise ValueError( + f"Found conflicts between 'rope_type={rope_type}' (modern " + f"field) and 'type={rope_type_legacy}' (legacy field). " + "You should only specify one of them." + ) + + if "rope_type" not in rope_parameters and "type" in rope_parameters: + rope_parameters["rope_type"] = rope_parameters["type"] + logger.info("Replacing legacy 'type' key with 'rope_type'") + if "rope_type" not in rope_parameters: raise ValueError("rope_parameters should have a 'rope_type' key") From b780892013147fca7434ffa855d8056286b9b537 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:15:20 +0100 Subject: [PATCH 35/70] More models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/glm4.py | 9 +-------- vllm/model_executor/models/glm4_moe.py | 11 +---------- vllm/model_executor/models/gpt_j.py | 3 +-- vllm/model_executor/models/gpt_neox.py | 3 +-- vllm/model_executor/models/gpt_oss.py | 3 +-- vllm/model_executor/models/granite.py | 19 +++---------------- vllm/model_executor/models/granitemoe.py | 13 +++---------- .../model_executor/models/granitemoehybrid.py | 1 - .../model_executor/models/granitemoeshared.py | 6 +----- vllm/model_executor/models/grok1.py | 11 ++++++----- 10 files changed, 18 insertions(+), 61 deletions(-) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 58d7b3c77727..42276f848e8b 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -57,7 +57,6 @@ def __init__( max_position: int = 4096 * 32, head_dim: int | None = None, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, rope_parameters: tuple | None = None, @@ -86,7 +85,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -107,8 +105,7 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=self.rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) @@ -150,8 +147,6 @@ def __init__( quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = Glm4Attention( config=config, @@ -159,12 +154,10 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 69453b9b8c20..5aa51af54a00 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -26,7 +26,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from 
typing import Any import torch from torch import nn @@ -233,8 +232,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -264,7 +261,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = use_qk_norm @@ -291,8 +287,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( @@ -341,8 +336,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. @@ -354,8 +347,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e416ecde0c1e..e94de8952fa6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,13 +95,12 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index af0c9209231c..815c2fba4d9f 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,13 +92,12 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index a7ef95f28763..33daf407435d 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,10 +67,10 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_parameters["rope_theta"], dtype=torch.float32, rope_parameters={ "rope_type": "yarn", + "rope_base": config.rope_parameters["rope_theta"], "factor": config.rope_parameters["factor"], "original_max_position_embeddings": 
config.rope_parameters[ "original_max_position_embeddings" @@ -90,7 +90,6 @@ def __init__( self.q_size = self.num_attention_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_parameters["rope_theta"] self.qkv_proj = QKVParallelLinear( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 391e42dd8631..abc7d18edb10 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -112,8 +111,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -143,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -167,8 +163,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,14 +199,8 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -225,8 +214,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 844274789bca..300d53369ec9 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -140,9 +140,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -172,7 +171,6 @@ def __init__( if attention_multiplier is not None else self.head_dim**-1 ) - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -194,9 +192,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), - is_neox_style=True, 
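# A minimal sketch of the calling convention these hunks converge on, assuming
# get_rope() now reads the base frequency from rope_parameters["rope_theta"]
# instead of a separate `base` argument; the dict shape is the one the dbrx and
# arctic hunks construct, and 10000.0 is a placeholder value:
#
#     rotary_emb = get_rope(
#         head_dim,
#         rotary_dim=head_dim,
#         max_position=max_position_embeddings,
#         rope_parameters={"rope_type": "default", "rope_theta": 10000.0},
#         is_neox_style=True,
#     )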
rope_parameters=rope_parameters, + is_neox_style=True, ) self.attn = Attention( self.num_heads, @@ -235,16 +232,12 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 3b6af559058d..1a952107948e 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -274,7 +274,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=int(config.rope_parameters["rope_theta"]), rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index d8ed10f6b379..fd346db7e35a 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -84,16 +84,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 9dc231863f74..545b8f13988e 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -25,6 +25,7 @@ from collections.abc import Iterable from itertools import islice +from typing import Any import torch import torch.nn.functional as F @@ -133,8 +134,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -161,7 +162,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -183,7 +183,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) @@ -236,13 +236,14 @@ def __init__( # Requires transformers > 4.32.0 # Default rope_theta value if not in config - rope_theta = 10000 + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, 
max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", From ad9dff2bb94c43105af0f02e2574d2eae1a802a6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:31:21 +0100 Subject: [PATCH 36/70] Fix docs build Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 9c4dbb5dae10..8f02aaa10a5b 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,6 +7,7 @@ from collections.abc import Callable from dataclasses import asdict from functools import cache, partial +from importlib.metadata import version from pathlib import Path from typing import Any, Literal, TypeVar @@ -26,7 +27,6 @@ ) from packaging.version import Version from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig -from transformers import __version__ as TRANSFORMERS_VERSION from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( @@ -396,7 +396,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" text_config = config.get_text_config() - if Version(TRANSFORMERS_VERSION) >= Version("5.0.0.dev0"): + if Version(version("transformers")) >= Version("5.0.0.dev0"): from transformers.modeling_rope_utils import RopeParameters rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( From 461ff9476384f5dc479cbaf2b756b58aba0e0167 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:38:27 +0100 Subject: [PATCH 37/70] Update some more models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/plamo2.py | 12 ++++-------- vllm/model_executor/models/solar.py | 19 +++---------------- vllm/model_executor/models/step3_text.py | 14 +++++--------- vllm/model_executor/models/zamba2.py | 4 +--- vllm/transformers_utils/configs/arctic.py | 12 ++++++++++-- vllm/transformers_utils/configs/flex_olmo.py | 13 +++++++++---- 6 files changed, 32 insertions(+), 42 deletions(-) diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 5c0de87221db..6de7f71bd2c3 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -576,12 +576,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) - self.rope_theta = ( - config.rope_parameters["rope_theta"] - if hasattr(config, "rope_parameters") - else 10000 - ) - self.rope_parameters = getattr(config, "rope_parameters", None) + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = 10000 + max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -592,8 +589,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - 
rope_parameters=self.rope_parameters, + rope_parameters=config.rope_parameters, ) self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm.weight = torch.nn.Parameter( diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index b400806d2005..0ea25d96beea 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -25,7 +25,6 @@ """Inference-only Solar model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -142,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -166,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -202,15 +197,9 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -224,8 +213,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 13609e773200..3c377a2c539d 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -36,6 +36,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.step3_vl import Step3TextConfig from .interfaces import SupportsPP from .utils import ( @@ -144,9 +145,8 @@ def __init__( num_heads: int, num_kv_heads: int, norm_eps: float, - rope_theta: int, + rope_parameters: dict[str, Any], share_q_dim: int | None = None, - rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 8192, head_dim: int = 256, cache_config: CacheConfig | None = None, @@ -198,7 +198,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embedding, - base=rope_theta, rope_parameters=rope_parameters, ) scaling = self.head_dim**-0.5 @@ -227,15 +226,13 @@ def forward( class Step3TextDecoderLayer(nn.Module): def __init__( self, - config: ModelConfig, + config: 
Step3TextConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: super().__init__() - config = config.hf_config self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) self.self_attn = Step3TextAttention( hidden_size=self.hidden_size, @@ -247,8 +244,7 @@ def __init__( max_position_embedding=config.max_position_embedding, head_dim=config.head_dim, share_q_dim=config.share_q_dim, - rope_theta=config.rope_parameters["rope_theta"], - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -338,7 +334,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( - config=vllm_config.model_config, + config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 5c708a35682d..5f5f973e0c8d 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -128,7 +128,6 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_hybrid_layers = num_hybrid_layers - self.rope_theta = config.rope_parameters["rope_theta"] self.attention_hidden_size = config.attention_hidden_size self.total_num_attention_heads = config.num_attention_heads @@ -233,8 +232,7 @@ def __init__( head_size=self.attention_head_dim, rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, - base=self.rope_theta, - rope_parameters=None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 1707e15285c8..480d0c4ea062 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -132,7 +132,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters: dict[str, Any] | None = None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=1, @@ -165,7 +165,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + rope_theta = kwargs.pop("rope_theta", 1e6) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + elif "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 30acfce5480c..1d2be4cd8af8 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -25,8 +26,7 @@ def __init__( bos_token_id=None, eos_token_id=100257, tie_word_embeddings=False, - rope_theta=500000.0, - 
rope_parameters=None, + rope_parameters: dict[str, Any] | None = None, attention_bias=False, attention_dropout=0.0, num_experts_per_tok=5, @@ -62,10 +62,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + rope_theta = kwargs.pop("rope_theta", 500000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + elif "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok From fa2ccedc52ee496a5f2bd28744afc439740ee014 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 17:20:39 +0100 Subject: [PATCH 38/70] Update some more models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 13 +++++---- vllm/model_executor/models/hunyuan_v1.py | 27 +++---------------- vllm/model_executor/models/internlm2.py | 10 ++----- vllm/model_executor/models/internlm2_ve.py | 5 +--- vllm/model_executor/models/kimi_linear.py | 4 +-- vllm/model_executor/models/lfm2.py | 6 +---- vllm/model_executor/models/lfm2_moe.py | 6 +---- vllm/model_executor/models/llama.py | 10 ++----- vllm/model_executor/models/llama4.py | 11 +------- vllm/model_executor/models/longcat_flash.py | 22 +++++++-------- vllm/model_executor/models/minicpm.py | 10 ++----- vllm/model_executor/models/minicpm3.py | 10 +------ vllm/model_executor/models/minicpm_eagle.py | 5 +--- vllm/model_executor/models/minimax_m2.py | 10 ++----- vllm/model_executor/models/minimax_text_01.py | 9 +++---- vllm/model_executor/models/mixtral.py | 6 +---- vllm/model_executor/models/molmo.py | 1 - vllm/model_executor/models/nemotron.py | 7 +---- vllm/model_executor/models/nemotron_nas.py | 21 +++------------ vllm/model_executor/models/olmo.py | 1 - vllm/model_executor/models/olmoe.py | 3 --- vllm/model_executor/models/openpangu.py | 19 ++----------- vllm/model_executor/models/orion.py | 7 +---- vllm/model_executor/models/ouro.py | 11 +------- vllm/model_executor/models/persimmon.py | 1 - vllm/model_executor/models/phi.py | 9 +++---- vllm/model_executor/models/phimoe.py | 13 ++++----- vllm/model_executor/models/qwen.py | 6 +---- vllm/model_executor/models/qwen2_moe.py | 7 +---- vllm/model_executor/models/qwen3.py | 11 ++------ vllm/model_executor/models/qwen3_moe.py | 7 +---- vllm/model_executor/models/seed_oss.py | 11 ++------ .../transformers_utils/configs/kimi_linear.py | 1 - vllm/transformers_utils/configs/lfm2_moe.py | 1 - vllm/transformers_utils/configs/nemotron.py | 1 - vllm/transformers_utils/configs/olmo3.py | 1 - vllm/transformers_utils/configs/qwen3_next.py | 1 - vllm/transformers_utils/configs/step3_vl.py | 1 - 39 files changed, 64 insertions(+), 243 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index ff0f07d3ce6c..bf01fc341904 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -264,7 +264,7 @@ def __init__( hidden_size: 
int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 53e4b507c086..082caef16496 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -897,7 +897,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -973,18 +972,18 @@ def __init__( prefix=f"{prefix}.o_proj", ) - if rope_parameters: - rope_parameters["rope_type"] = "deepseek_yarn" + if config.rope_parameters: + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_parameters: - mscale_all_dim = rope_parameters.get("mscale_all_dim", False) - scaling_factor = rope_parameters["factor"] + if config.rope_parameters: + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index eb8c6313d8ba..9b5c9419874a 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import regex as re import torch @@ -142,8 +141,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -177,7 +174,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -204,8 +200,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -254,8 +249,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -289,7 +282,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -314,8 +306,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - 
rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -494,14 +485,8 @@ def __init__( if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False @@ -520,8 +505,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, @@ -537,8 +520,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 57bcf173ceef..bc08367c5b9c 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -91,8 +91,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -120,7 +119,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.key_value_groups = int(self.num_heads / self.num_kv_heads) self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.wqkv = QKVParallelLinear( @@ -144,7 +142,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, ) self.attn = Attention( @@ -204,15 +201,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index d9f0c0019637..a57db82242af 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -30,15 +30,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) 
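# A minimal sketch of the config.rope_parameters dict the decoder layers now
# pass straight through, assuming the HF config has already been normalised
# (the legacy "type" key mapped to "rope_type" by patch_rope_parameters_dict,
# and the base frequency folded in as the arctic, flex_olmo and grok1 hunks
# do); the 10000.0 is a placeholder value:
#
#     {"rope_type": "default", "rope_theta": 10000.0}
#
# Scaled variants keep their extra fields in the same dict, e.g. "factor" and
# "original_max_position_embeddings" for the yarn family.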
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index fb96c1621cbe..53c396d138de 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -190,9 +190,8 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, + rope_parameters: dict[str, Any], use_nope: bool = False, - rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -210,7 +209,6 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index a507875adcf7..1130d939ab17 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -96,8 +96,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -126,7 +125,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -149,7 +147,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, rope_parameters=rope_parameters, is_neox_style=True, ) @@ -209,7 +206,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=config.rope_parameters["rope_theta"], rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 1d9e660651dc..06269b089348 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -189,8 +189,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -219,7 +218,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -242,7 +240,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, 
max_position=self.max_position_embeddings, - base=self.rope_theta, rope_parameters=rope_parameters, is_neox_style=True, ) @@ -303,7 +300,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=config.rope_parameters["rope_theta"], rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 547c06d310b0..f993d50d485f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -120,7 +119,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -184,9 +182,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_parameters=rope_parameters, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -256,7 +252,6 @@ def forward( def _init_rotary_emb( self, config: LlamaConfig, - rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -268,7 +263,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -317,7 +312,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=config.rope_parameters["rope_theta"], rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index d23c672d9158..4c6d1d424475 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -19,7 +19,6 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -208,7 +205,6 @@ def __init__( self.floor_scale = getattr(config, "floor_scale", 8192.0) self.attn_scale = getattr(config, "attn_scale", 0.1) - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.n_rep = self.num_heads // self.num_kv_heads self.qk_norm = ( @@ -248,8 +244,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=int(rope_theta), - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) if not self.nope @@ -331,8 +326,6 @@ def __init__( self.layer_idx = extract_layer_index(prefix) self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size - rope_theta = config.rope_parameters["rope_theta"] - 
rope_parameters = config.rope_parameters max_position_embeddings = config.max_position_embeddings self.self_attn = Llama4Attention( @@ -340,8 +333,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 3b55a344edc8..93c642e0fd4c 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -108,7 +108,6 @@ def __init__( eos_token_id=100001, pretraining_tp=1, tie_word_embeddings=False, - rope_theta=1000000.0, rope_parameters=None, attention_bias=False, attention_dropout=0.0, @@ -162,7 +161,14 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + elif "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -336,15 +342,9 @@ def __init__( super().__init__() self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe # Dual attention structure self.self_attn = nn.ModuleList( @@ -361,8 +361,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=None diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 0892c2fe7787..7791c52808d8 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -230,8 +230,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -257,7 +256,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -281,7 +279,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, 
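# --- Editor's sketch (illustrative annotation, not part of the patch) --------
# On the config side, legacy checkpoint keys are folded into `rope_parameters`
# so the dict always exists for the attention layers. A condensed sketch of the
# fallback used by configs such as LongcatFlashConfig in these hunks (the
# default theta shown is just an example value):
#
#     rope_scaling = kwargs.pop("rope_scaling", None)
#     rope_parameters = rope_scaling or rope_parameters
#     rope_theta = kwargs.pop("rope_theta", 10000.0)
#     if rope_parameters is None:
#         rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
#     elif "rope_theta" not in rope_parameters:
#         rope_parameters["rope_theta"] = rope_theta
#     self.rope_parameters = rope_parameters
# ------------------------------------------------------------------------------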
rope_parameters=rope_parameters, ) @@ -324,8 +321,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_parameters = getattr(config, "rope_parameters", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -339,8 +334,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_parameters=self.rope_parameters, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index b6b998dbc58c..2d775219fc97 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -25,8 +25,6 @@ # limitations under the License. """Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any - import torch from torch import nn from transformers import PretrainedConfig @@ -62,8 +60,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -84,7 +80,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.q_a_proj = ReplicatedLinear( @@ -127,8 +122,7 @@ def __init__( self.qk_rope_head_dim, rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_local_heads, @@ -204,8 +198,6 @@ def _init_attn_block(self): v_head_dim=self.config.v_head_dim, q_lora_rank=self.config.q_lora_rank, kv_lora_rank=self.config.kv_lora_rank, - rope_theta=self.rope_theta, - rope_parameters=self.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 3d9b2fdfad51..88bcd5f8962b 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -68,8 +68,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_parameters = getattr(config, "rope_parameters", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -83,8 +81,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_parameters=self.rope_parameters, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 
21a6c23e300d..f0874dc36a2d 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -149,8 +149,7 @@ def __init__( num_heads: int, num_kv_heads: int, rotary_dim: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, @@ -180,7 +179,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -205,7 +203,6 @@ def __init__( self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, ) self.attn = Attention( @@ -252,8 +249,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = max( @@ -269,8 +264,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rotary_dim=config.rotary_dim, - rope_theta=rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index bf1ecc822756..90ae5b832a00 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -187,8 +187,8 @@ def __init__( head_dim: int, num_kv_heads: int, rotary_dim: int, + rope_parameters: dict, max_position: int = 4096 * 32, - rope_theta: float = 10000, sliding_window: int | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, @@ -214,7 +214,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.sliding_window = sliding_window self.prefix = prefix @@ -247,7 +246,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=int(rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, dtype=torch.float32, ) @@ -287,8 +286,6 @@ def __init__( self.hidden_size = config.hidden_size self.expert_num = expert_num - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads @@ -328,7 +325,7 @@ def __init__( else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, quant_config=quant_config, layer_idx=self._ilayer, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d7a1cb82fb4f..95264184df26 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -161,7 +161,6 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, 
- rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,7 +188,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -211,7 +209,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -249,14 +247,12 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = MixtralAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 0853d8ac0257..3abd54b156f9 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -437,7 +437,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index f262326a98c5..e23f781cbb3c 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -150,8 +150,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -181,7 +180,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings @@ -206,7 +204,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) @@ -243,7 +240,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) if rope_parameters is not None and getattr( config, "original_max_position_embeddings", None @@ -264,7 +260,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b41da42496d0..6dfe920787cd 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -82,8 +81,6 @@ def __init__( hidden_size: int, num_heads: 
int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -97,8 +94,6 @@ def __init__( hidden_size, num_heads, num_kv_heads, - rope_theta, - rope_parameters, max_position_embeddings, quant_config, bias, @@ -111,7 +106,6 @@ def __init__( def _init_rotary_emb( self, config, - rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: # Enables YARN for Mistral and LLaMA4 derivatives. @@ -126,8 +120,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -148,14 +141,8 @@ def __init__( self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -176,8 +163,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 7700b55d605b..43b4ad029f45 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -105,7 +105,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index dee8d790c589..6634fc85fa6b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -123,7 +123,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) @@ -148,7 +147,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,7 +174,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/openpangu.py 
b/vllm/model_executor/models/openpangu.py index 373c1fe2f036..6578d61e1fc2 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -259,7 +259,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -274,8 +273,6 @@ def __init__( self.v_head_dim = v_head_dim self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank - self.rope_theta = rope_theta - self.tp_size = get_tensor_model_parallel_world_size() if num_heads % self.tp_size != 0: raise ValueError( @@ -353,7 +350,6 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, is_neox_style=False, ) @@ -407,8 +403,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -454,7 +448,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -475,9 +468,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_parameters=rope_parameters, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -521,7 +512,6 @@ def forward( def _init_rotary_emb( self, config: PretrainedConfig, - rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -533,8 +523,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) @@ -555,7 +544,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) @@ -579,7 +567,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, @@ -607,8 +594,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_parameters=getattr(config, "rope_parameters", None), max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index f8da26a4b0b3..94c783ff2855 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -88,8 +88,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | 
None = None, quant_config: QuantizationConfig | None = None, @@ -115,7 +114,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -139,7 +137,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, ) self.attn = Attention( @@ -175,14 +172,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 00cfba00e287..63d2fff6ec8b 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -112,10 +112,8 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -140,7 +138,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config # Get total_ut_steps from config, default to 4 if not specified @@ -170,8 +167,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = nn.ModuleList() @@ -226,9 +222,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -244,10 +237,8 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_parameters=rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 1aeca4c048d8..521a1f292e28 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -138,7 +138,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py 
index 8fee53c23fb4..4983b0f6c14f 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -115,16 +115,15 @@ def __init__( ) assert rotary_dim % 2 == 0 - # pylint: disable=C0301 - # Refer to: - # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = getattr(config, "rope_theta", 10000.0) + # Refer to https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = 10000.0 max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index e7860568b58e..b7c05569f297 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -86,7 +86,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters=None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=2, @@ -119,7 +119,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + if rope_parameters is None: + rope_theta = kwargs.pop("rope_theta", 1e6) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -304,7 +306,6 @@ def __init__( num_kv_heads: int, head_dim: int | None = None, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, rope_parameters: dict | None = None, @@ -332,7 +333,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.rope_parameters = rope_parameters self.qkv_proj = QKVParallelLinear( @@ -355,9 +355,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), - is_neox_style=True, rope_parameters=self.rope_parameters, + is_neox_style=True, ) self.attn = Attention( self.num_heads, @@ -393,7 +392,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = PhiMoEAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -402,7 +400,6 @@ def __init__( head_dim=getattr( config, "head_dim", self.hidden_size // config.num_attention_heads ), - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, rope_parameters=config.rope_parameters, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 377db922d006..efa4f044b51a 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -83,8 +83,7 @@ def __init__( hidden_size: int, num_heads: int, max_position_embeddings: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -117,7 
+116,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, ) self.attn = Attention( @@ -153,13 +151,11 @@ def __init__( super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, - rope_theta=rope_theta, rope_parameters=rope_parameters, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 1be2074de33f..45c6cf4ae850 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -194,8 +194,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -222,7 +221,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -248,7 +246,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) @@ -291,7 +288,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -301,7 +297,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 45ed1d63ceb9..66e5b80392f9 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -57,14 +57,13 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, max_position: int = 4096 * 32, head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -89,7 +88,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -113,7 +111,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) @@ -166,9 +163,6 @@ def __init__( ) 
-> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -187,13 +181,12 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f87de95846e8..d6a1f7a48a25 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -216,8 +216,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_parameters: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -247,7 +246,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -273,7 +271,6 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) @@ -326,7 +323,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( @@ -336,7 +332,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 31b1c39bf88f..5bc3886fa7ff 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -112,11 +112,10 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -140,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -163,7 +161,6 @@ def __init__( self.head_dim, 
rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, rope_parameters=rope_parameters, ) self.attn = Attention( @@ -200,9 +197,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_parameters = getattr(config, "rope_parameters", None) # By default, SeedOss uses causal attention as it is a # decoder-only model. @@ -219,10 +213,9 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index cad8449f2443..dd994c203b0a 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -73,7 +73,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py index 37c038e12db8..a86766322b92 100644 --- a/vllm/transformers_utils/configs/lfm2_moe.py +++ b/vllm/transformers_utils/configs/lfm2_moe.py @@ -121,7 +121,6 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 87a7239fda57..4a3af633f0f5 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -160,7 +160,6 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index ed0dfdcbc6f5..2a3d2bbe319f 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -63,7 +63,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index f9cf4cec438a..54e611e5f780 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -236,7 +236,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta # Try to set `rope_scaling` if 
available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index d3a49d7eafc3..55e3e26f93b0 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -130,7 +130,6 @@ def __init__( self.moe_intermediate_size = moe_intermediate_size self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k - self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters From 4127d543e88d5cb7b40e288831df7a6e5088f89d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 17:31:59 +0100 Subject: [PATCH 39/70] Remove last references to `base` arg of `get_rope` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 12 ++++-------- vllm/model_executor/models/chatglm.py | 3 ++- vllm/model_executor/models/mllama4.py | 8 ++++++-- vllm/model_executor/models/olmo2.py | 1 - 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index d445a1d69466..1d24477bbc56 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -85,10 +85,9 @@ def benchmark_mrope( tp_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_parameters: dict[str, Any] = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, - is_neox_style=is_neox_style, rope_parameters=rope_parameters, + is_neox_style=is_neox_style, dtype=dtype, ).to(device=device) @@ -203,7 +201,6 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, str(rope_parameters), str(dtype).split(".")[-1], @@ -302,8 +299,8 @@ def benchmark_mrope( head_dim = config.hidden_size // total_num_heads q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim + rope_parameters = config.rope_parameters is_neox_style = True - rope_theta = config.rope_parameters["rope_theta"] max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +312,8 @@ def benchmark_mrope( num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_theta=rope_theta, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, - rope_parameters=config.rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5d6f5e9125a2..dbfcd62d0bca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,6 +99,7 @@ def __init__( # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} # NOTE: zai-org/cogagent-9b-20241220 uses 
original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope @@ -106,7 +107,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, - base=10000 * rope_ratio, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index ac234e6a47ec..bdc8d64b4021 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -286,13 +286,17 @@ def __init__( prefix=f"{prefix}.o_proj", ) + rope_parameters = { + "rope_type": "mllama4", + "rope_theta": config.rope_parameters["rope_theta"], + } + self.rotary_emb = get_rope( head_size=self.head_dim, rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, - base=config.rope_parameters["rope_theta"], - rope_parameters={"rope_type": "mllama4"}, + rope_parameters=rope_parameters, is_neox_style=False, dtype=torch.complex64, # important ) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index a9ec6b501837..ed7c3e2f0f05 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -147,7 +147,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=rope_parameters["rope_theta"], rope_parameters=rope_parameters, ) From 1ebd0e4f6efa81bbb1ccfa23ff1dfe8b98f4eff9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 17:35:00 +0100 Subject: [PATCH 40/70] Update mrope test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/kernels/core/test_mrope.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 5ccab51c2a24..2792a5e7ab0d 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -112,7 +112,6 @@ def test_mrope( ) is_neox_style = True - rope_theta = config.rope_parameters["rope_theta"] max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -121,9 +120,8 @@ def test_mrope( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, - is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, + is_neox_style=is_neox_style, dtype=dtype, ).to(device=device) @@ -185,7 +183,6 @@ def test_mrope_torch_compile_tracing( else config.hidden_size // total_num_heads ) is_neox_style = True - rope_theta = config.rope_parameters["rope_theta"] max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -194,9 +191,8 @@ def test_mrope_torch_compile_tracing( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, - is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, + is_neox_style=is_neox_style, dtype=dtype, ).to(device=device) From ec30fef4da6871bddbb3bc4ab5bce69d94889ec0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 19:32:31 +0100 Subject: [PATCH 41/70] Check everything Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 7 ++-- .../offline_inference/context_extension.py | 2 +- tests/kernels/core/test_pos_encoding.py | 21 ++++++------ .../pooling/test_nomic_max_model_len.py | 10 +++--- tests/test_config.py | 32 ++++++++++--------- vllm/config/model.py | 12 +++---- vllm/model_executor/models/apertus.py | 12 ++----- vllm/model_executor/models/arcee.py | 3 -- vllm/model_executor/models/baichuan.py | 5 ++- vllm/model_executor/models/bamba.py | 3 +- vllm/model_executor/models/chameleon.py | 18 ++--------- vllm/model_executor/models/commandr.py | 22 +++++-------- vllm/model_executor/models/config.py | 18 +++++------ vllm/model_executor/models/deepseek_v2.py | 17 ++++------ vllm/model_executor/models/exaone.py | 2 -- vllm/model_executor/models/exaone4.py | 2 -- vllm/model_executor/models/gemma3.py | 7 ++-- vllm/model_executor/models/gemma3n.py | 7 ++-- vllm/model_executor/models/glm4.py | 1 - vllm/model_executor/models/gpt_j.py | 2 ++ vllm/model_executor/models/gpt_neox.py | 2 ++ vllm/model_executor/models/granite.py | 2 -- vllm/model_executor/models/hunyuan_v1.py | 2 -- vllm/model_executor/models/kimi_linear.py | 3 -- vllm/model_executor/models/lfm2.py | 7 +--- vllm/model_executor/models/lfm2_moe.py | 7 +--- vllm/model_executor/models/llama.py | 3 -- vllm/model_executor/models/longcat_flash.py | 8 ++--- vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/nemotron.py | 12 +------ vllm/model_executor/models/nemotron_nas.py | 2 -- vllm/model_executor/models/olmo.py | 2 +- vllm/model_executor/models/olmo2.py | 17 +++++----- vllm/model_executor/models/olmoe.py | 3 +- vllm/model_executor/models/openpangu.py | 1 + vllm/model_executor/models/orion.py | 3 +- vllm/model_executor/models/persimmon.py | 2 +- vllm/model_executor/models/phimoe.py | 5 ++- vllm/model_executor/models/qwen.py | 3 +- vllm/model_executor/models/qwen2_moe.py | 3 +- vllm/model_executor/models/qwen3_moe.py | 3 +- vllm/model_executor/models/solar.py | 3 -- vllm/transformers_utils/config.py | 5 +++ vllm/transformers_utils/configs/arctic.py | 5 --- vllm/transformers_utils/configs/flex_olmo.py | 6 ++-- .../transformers_utils/configs/kimi_linear.py | 7 ++-- vllm/transformers_utils/configs/lfm2_moe.py | 11 +++++-- vllm/transformers_utils/configs/nemotron.py | 19 ++++++----- vllm/transformers_utils/configs/olmo3.py | 7 ++-- vllm/transformers_utils/configs/qwen3_next.py | 12 ++++--- vllm/transformers_utils/configs/step3_vl.py | 7 ++-- 52 files changed, 156 insertions(+), 222 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 1d24477bbc56..8866bfdfac88 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,9 +6,9 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# rope_theta, is_neox_style, rope_parameters, dtype, torch_mean, torch_median, -# torch_p99, torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, -# triton_max, speedup +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, +# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, +# speedup # # == Usage Examples == # @@ -252,7 +252,6 @@ def benchmark_mrope( "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", "rope_parameters", "dtype", diff --git 
a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index 0c5d7c12fb19..67d33e1881ee 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -19,8 +19,8 @@ def create_llm(): # Use yarn to extend context hf_overrides = { - "rope_theta": rope_theta, "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index ef4c0adb8d31..357d9910347d 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -74,7 +74,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: float = 10000, + rope_theta: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -83,7 +83,8 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + rope = get_rope(head_size, rotary_dim, max_position, rope_parameters, is_neox_style) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -120,7 +121,7 @@ def test_rotary_embedding( @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] - BASES = [10000, 1000000] + ROPE_THETAS = [10000, 1000000] ROPE_PARAMETERS = ( None, {"rope_type": "linear", "factor": (1,)}, @@ -130,7 +131,7 @@ def test_rope_module_cache(): HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, - BASES, + ROPE_THETAS, IS_NEOX_STYLE, ROPE_PARAMETERS, DTYPES, @@ -141,20 +142,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, + rope_theta, is_neox_style, rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_style, rope_parameters, + is_neox_style, dtype, ) # different settings cannot share the same rope module @@ -168,20 +169,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, + rope_theta, is_neox_style, rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_style, rope_parameters, + is_neox_style, dtype, ) # check if cache take effect diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 589a62dfd6a7..d6216a87a229 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 +from typing import Any + import pytest from ...utils import EmbedModelInfo @@ -79,8 +81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides = { - "rope_theta": rope_theta, "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": 
original_max_position_embeddings, @@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_illegal(model_info, vllm_runner): - hf_overrides = { - "rope_theta": rope_theta, + hf_overrides: dict[str, Any] = { "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): pass hf_overrides = { - "rope_theta": rope_theta, "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/test_config.py b/tests/test_config.py index de36463a6f99..16f68d18fc68 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -249,46 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} - TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + TEST_ROPE_PARAMETERS = { + "rope_theta": 16_000_000.0, + "rope_type": "dynamic", + "factor": 2.0, + } + LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"} + LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0} llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct") - assert getattr(llama_model_config.hf_config, "rope_parameters", None) is None - assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 + assert ( + getattr(llama_model_config.hf_config, "rope_parameters", None) + == LLAMA_ROPE_PARAMETERS + ) assert llama_model_config.max_model_len == 8192 llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - hf_overrides={ - "rope_parameters": TEST_ROPE_SCALING, - "rope_theta": TEST_ROPE_THETA, - }, + hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS}, ) assert ( getattr(llama_model_config.hf_config, "rope_parameters", None) - == TEST_ROPE_SCALING + == TEST_ROPE_PARAMETERS ) - assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 longchat_model_config = ModelConfig("lmsys/longchat-13b-16k") - # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + # Check if LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config assert all( longchat_model_config.hf_config.rope_parameters.get(key) == value - for key, value in LONGCHAT_ROPE_SCALING.items() + for key, value in LONGCHAT_ROPE_PARAMETERS.items() ) assert longchat_model_config.max_model_len == 16384 longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", hf_overrides={ - "rope_parameters": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_PARAMETERS, }, ) assert ( getattr(longchat_model_config.hf_config, "rope_parameters", None) - == TEST_ROPE_SCALING + == TEST_ROPE_PARAMETERS ) assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/config/model.py b/vllm/config/model.py index 4a2b1e1465ce..0235604459f2 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2069,15 +2069,15 @@ def _get_and_verify_max_len( ) derived_max_model_len = default_max_len + # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. + # To simplify the verification, we convert it to dict[str, TypedDict]. 
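Aside: the override shape exercised by the context-extension example and the tests above nests `rope_theta` inside `rope_parameters` instead of passing it as a sibling key. A minimal sketch of the new format (model name and numbers are illustrative, not taken from this patch):

    from vllm import LLM

    # YaRN context extension via hf_overrides; every RoPE field, including
    # rope_theta, now goes inside the single rope_parameters dict.
    llm = LLM(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        hf_overrides={
            "rope_parameters": {
                "rope_theta": 500_000.0,
                "rope_type": "yarn",
                "factor": 4.0,
                "original_max_position_embeddings": 8192,
            },
        },
    )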
rope_parameters = getattr(hf_config, "rope_parameters", None) + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters = {"": rope_parameters} + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. if rope_parameters is not None and "gemma3" not in hf_config.model_type: - # In Transformers v5 rope_parameters could be RopeParameters or - # dict[str, RopeParameters] where RopeParameters is a TypedDict. To simplify - # the verification, we convert any RopeParameters to dict[str, RopeParameters] - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): - rope_parameters = {"": rope_parameters} scaling_factor = 1.0 for rp in rope_parameters.values(): # No need to consider "type" key because of patch_rope_parameters when @@ -2099,7 +2099,7 @@ def _get_and_verify_max_len( if rope_type == "yarn": derived_max_model_len = rp["original_max_position_embeddings"] - # Do this outside loop since all layers should have the same scaling + # Do this outside loop since all layer types should have the same scaling derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index d76d03427364..b75e91319bba 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -118,7 +117,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -174,9 +172,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_parameters=rope_parameters, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -222,7 +218,6 @@ def forward( def _init_rotary_emb( self, config: ApertusConfig, - rope_parameters: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -234,7 +229,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -250,8 +245,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -279,7 +272,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 49b81a5ea0c3..b3887b16f4d7 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -103,8 +103,6 @@ def __init__( ) -> None: 
super().__init__() self.hidden_size = config.hidden_size - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Determine if attention bias is needed (some variants use bias terms) attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -126,7 +124,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index af21e41c8763..edf47270e527 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -136,7 +136,7 @@ def __init__( hidden_size: int, num_heads: int, position_embedding: str, - rope_parameters: dict | None = None, + rope_parameters: dict, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -228,13 +228,12 @@ def __init__( ): super().__init__() self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 53fdc30cda1c..16648929c577 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -157,7 +157,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -191,7 +190,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index bf01fc341904..792c15ba4ff8 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -365,13 +365,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -380,7 +373,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, @@ -433,13 +426,6 @@ def __init__( ) -> 
None: super().__init__() self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -448,7 +434,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 73ed86c87908..bffbecc9bf09 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -173,28 +173,22 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + rope_parameters=config.rope_scaling, + is_neox_style=False, + ) # Model v2 has interleaved sliding windows, v1 does not self.v1 = isinstance(config, CohereConfig) self.sliding_window = None - rope_parameters = config.rope_parameters if not self.v1: layer_idx = extract_layer_index(prefix) - layer_type = config.layer_types[layer_idx] - if layer_type == "sliding_attention": + if config.layer_types[layer_idx] == "sliding_attention": self.sliding_window = config.sliding_window - if layer_type in rope_parameters: - # Transformers v5 - rope_parameters = rope_parameters[layer_type] - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - rope_parameters=rope_parameters, - is_neox_style=False, - ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index c5e9c876d68b..01e17280e0bc 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -77,16 +77,14 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) - rope_parameters = getattr(config, "rope_parameters", None) or {} - rope_theta = getattr(config, "rope_theta", config.rotary_emb_base) - rope_parameters["rope_theta"] = rope_theta + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = config.rotary_emb_base config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, - "base": rope_parameters["rope_theta"], - "rope_parameters": rope_parameters, + "rope_parameters": config.rope_parameters, } @@ -120,15 +118,15 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) - rope_parameters = getattr(config, "rope_parameters", None) or {} - rope_theta = getattr(config, "rope_theta", config.rotary_emb_base) - rope_parameters["rope_theta"] = rope_theta + + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = config.rotary_emb_base + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, 
"max_position": max_trained_positions, - "base": rope_parameters["rope_theta"], - "rope_parameters": rope_parameters, + "rope_parameters": config.rope_parameters, } # we ignore config.rotary_scaling_factor so that for datasets shorter diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 082caef16496..41f8a4334145 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -111,7 +110,6 @@ def __init__( config: DeepseekV2Config | DeepseekV3Config, hidden_size: int, num_heads: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -160,7 +158,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -406,7 +404,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -480,20 +477,20 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if rope_parameters: - rope_parameters["rope_type"] = "deepseek_yarn" + if config.rope_parameters: + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_parameters: - mscale_all_dim = rope_parameters.get("mscale_all_dim", False) - scaling_factor = rope_parameters["factor"] + if config.rope_parameters: + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 1727775d452a..d13275488fe9 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -242,8 +242,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 5c74b7c1230e..e851fafccad8 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -222,8 +222,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias diff --git a/vllm/model_executor/models/gemma3.py 
b/vllm/model_executor/models/gemma3.py index d3d26bc85c5e..32b21d796b33 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -160,12 +160,11 @@ def __init__( sliding_window = config.sliding_window if self.is_sliding else None # Initialize the rotary embedding. - if config.rope_parameters and layer_type in config.rope_parameters: - # Transformers v5 + if layer_type in config.rope_parameters: + # Transformers v5 rope config. rope_parameters = config.rope_parameters[layer_type] else: - # Transformers v4 - + # Transformers v4 rope config. # Global attention. Use the values in config.json. rope_parameters = config.rope_parameters # Local attention. Override the values in config.json. diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index d66bf6014615..f72e706a0a72 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -337,12 +337,11 @@ def __init__( self.sliding_window = config.sliding_window if is_sliding else None # Initialize the rotary embedding. - if config.rope_parameters and layer_type in config.rope_parameters: - # Transformers v5 + if layer_type in config.rope_parameters: + # Transformers v5 rope config. rope_parameters = config.rope_parameters[layer_type] else: - # Transformers v4 - + # Transformers v4 rope config. # Global attention. Use the values in config.json. rope_parameters = config.rope_parameters # Local attention. Override the values in config.json. diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 42276f848e8b..f8ef3b0385fb 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -59,7 +59,6 @@ def __init__( qkv_bias: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_parameters: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e94de8952fa6..762cd0c1f4d2 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,6 +95,8 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = 10000 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 815c2fba4d9f..ccec9e556493 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,6 +92,8 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = 10000 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index abc7d18edb10..1dc205b47753 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -199,8 +199,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - if ompe := getattr(config, "original_max_position_embeddings", 
None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 9b5c9419874a..9fa5e2bd33f2 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -485,8 +485,6 @@ def __init__( if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index 53c396d138de..4562b2202c5e 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -190,7 +189,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_parameters: dict[str, Any], use_nope: bool = False, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -212,7 +210,6 @@ def __init__( self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None - assert rope_parameters is None assert num_heads % tp_size == 0 self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 1130d939ab17..74bdde27ece5 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -96,7 +95,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -147,7 +145,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -196,8 +194,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2Attention( @@ -206,7 +202,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 06269b089348..c088a0821152 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py 
@@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -189,7 +188,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -240,7 +238,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -290,8 +288,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2MoeAttention( @@ -300,7 +296,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f993d50d485f..0fa9d16490c6 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -283,8 +283,6 @@ def __init__( quant_config = self.get_quant_config(vllm_config) self.hidden_size = config.hidden_size - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -312,7 +310,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 93c642e0fd4c..fafe97cd2be7 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -163,11 +163,9 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} rope_theta = kwargs.pop("rope_theta", 1000000.0) - if rope_parameters is None: - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - elif "rope_theta" not in rope_parameters: + if "rope_theta" not in rope_parameters: rope_parameters["rope_theta"] = rope_theta self.rope_parameters = rope_parameters self.attention_bias = attention_bias @@ -343,8 +341,6 @@ def __init__( self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe # Dual attention structure self.self_attn = nn.ModuleList( diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 95264184df26..54ab8dd493e7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -246,7 +246,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 self.self_attn = MixtralAttention( config=config, hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3abd54b156f9..dc06938d5d6e 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -410,7 +410,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_parameters["rope_theta"] # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -437,6 +436,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index e23f781cbb3c..c3337bd1ea69 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -150,7 +149,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -204,7 +202,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( @@ -240,13 +238,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) - if rope_parameters is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_parameters["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -260,7 +251,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 6dfe920787cd..2eebe38051cb 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -141,8 +141,6 @@ def __init__( self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias diff --git a/vllm/model_executor/models/olmo.py 
b/vllm/model_executor/models/olmo.py index 43b4ad029f45..bd8a8e317544 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -87,7 +87,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_parameters["rope_theta"] self.clip_qkv = config.clip_qkv # Attention input projection. Projects x -> (q, k, v) @@ -105,6 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index ed7c3e2f0f05..4f20b8213117 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -122,14 +122,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer_idx = extract_layer_index(prefix) sliding_window = None - rope_parameters = {"rope_theta": self.config.rope_parameters["rope_theta"]} - if layer_types := getattr(self.config, "layer_types", None): - layer_type = layer_types[layer_idx] - if layer_type == "sliding_attention": - sliding_window = self.config.sliding_window - elif layer_type == "full_attention": - # Rope scaling is only applied on full attention layers. - rope_parameters.update(self.config.rope_parameters) + if ( + layer_types := getattr(self.config, "layer_types", None) + ) is not None and layer_types[layer_idx] == "sliding_attention": + sliding_window = self.config.sliding_window self.attn = Attention( self.num_heads, @@ -143,11 +139,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) # Rotary embeddings. + # TODO: Rope scaling should only be applied on full attention layers. Original + # implementation was setting the rope_parameters to None for sliding attention + # layers, but that does not disable rope scaling in vLLM. self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=self.config.rope_parameters, ) # Attention output projection. 
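These model-side edits all converge on the same call shape: the attention layer hands the config's `rope_parameters` dict to `get_rope()` instead of a separate `base` argument. A self-contained sketch of that call, mirroring the updated kernel tests (head size, positions and values are arbitrary):

    import torch
    from vllm.model_executor.layers.rotary_embedding import get_rope

    head_dim = 64
    # rope_theta replaces the old `base` argument and travels inside the
    # rope_parameters dict.
    rope = get_rope(
        head_dim,
        rotary_dim=head_dim,
        max_position=2048,
        rope_parameters={"rope_type": "default", "rope_theta": 10000.0},
        is_neox_style=True,
    )
    positions = torch.arange(4)
    query = torch.randn(4, head_dim)
    key = torch.randn(4, head_dim)
    query, key = rope(positions, query, key)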
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 6634fc85fa6b..c39e338d72e2 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -123,7 +123,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) num_heads = config.num_attention_heads @@ -174,7 +173,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 6578d61e1fc2..2cbfb13a5678 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -337,6 +337,7 @@ def __init__( # TODO: remove hard coding rope_parameters = { + "rope_theta": config.rope_parameters.get("rope_theta", 10000), "beta_fast": 32, "beta_slow": 1, "factor": 1, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 94c783ff2855..07cd2357a49e 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -172,13 +172,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 521a1f292e28..98963d52e484 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_parameters["rope_theta"] self.partial_rotary_factor = config.partial_rotary_factor self.is_causal = True @@ -138,6 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index b7c05569f297..8ffac95d9396 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -304,11 +304,11 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, head_dim: int | None = None, max_position: int = 4096 * 32, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_parameters: dict | None = None, prefix: str = "", ) -> None: super().__init__() @@ -333,7 +333,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads 
* self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_parameters = rope_parameters self.qkv_proj = QKVParallelLinear( hidden_size, @@ -355,7 +354,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - rope_parameters=self.rope_parameters, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index efa4f044b51a..6906a73579d1 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -151,12 +151,11 @@ def __init__( super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_parameters = getattr(config, "rope_parameters", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 45c6cf4ae850..5687897719b1 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -288,7 +288,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -297,7 +296,7 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index d6a1f7a48a25..8ee3dd99e11d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -323,7 +323,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_parameters = getattr(config, "rope_parameters", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -332,7 +331,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_parameters=rope_parameters, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 0ea25d96beea..7e9fc51036d2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -197,9 +197,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py 
index 8f02aaa10a5b..de5d2df55958 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -411,6 +411,11 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: if rope_theta is not None: rope_parameters = rope_parameters or {"rope_type": "default"} rope_parameters["rope_theta"] = rope_theta + # Add original_max_position_embeddings if present + if rope_parameters and ( + ompe := getattr(config, "original_max_position_embeddings", None) + ): + rope_parameters["original_max_position_embeddings"] = ompe # Write back to text_config text_config.rope_parameters = rope_parameters # Delete legacy attributes diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 480d0c4ea062..5c120cc44c03 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -165,14 +165,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters rope_theta = kwargs.pop("rope_theta", 1e6) if rope_parameters is None: rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - elif "rope_theta" not in rope_parameters: - rope_parameters["rope_theta"] = rope_theta self.rope_parameters = rope_parameters self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 1d2be4cd8af8..c343dc0999a8 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -64,11 +64,9 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} rope_theta = kwargs.pop("rope_theta", 500000.0) - if rope_parameters is None: - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - elif "rope_theta" not in rope_parameters: + if "rope_theta" not in rope_parameters: rope_parameters["rope_theta"] = rope_theta self.rope_parameters = rope_parameters self.attention_bias = attention_bias diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index dd994c203b0a..14894816801d 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -29,7 +29,6 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - rope_theta=10000.0, rope_parameters=None, tie_word_embeddings=False, moe_intermediate_size: int | None = None, @@ -75,7 +74,11 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py index 
a86766322b92..b399a03c030f 100644 --- a/vllm/transformers_utils/configs/lfm2_moe.py +++ b/vllm/transformers_utils/configs/lfm2_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. max_position_embeddings (`int`, *optional*, defaults to 128000): The maximum sequence length that this model might ever be used with. use_cache (`bool`, *optional*, defaults to `True`): @@ -100,7 +101,7 @@ def __init__( bos_token_id: int = 1, eos_token_id: int = 2, tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 128_000, use_cache: bool = True, norm_eps: float = 0.00001, @@ -121,6 +122,10 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 4a3af633f0f5..3b535979ade0 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -132,7 +132,6 @@ def __init__( bos_token_id=2, eos_token_id=3, tie_word_embeddings=False, - rope_theta=10000.0, rope_parameters=None, partial_rotary_factor=0.5, attention_bias=False, @@ -162,7 +161,11 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") @@ -190,12 +193,12 @@ def _rope_parameters_validation(self): if self.rope_parameters is None: return - if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: + if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 3: raise ValueError( - "`rope_parameters` must be a dictionary with two fields, " - f"`type` and `factor`, got {self.rope_parameters}" + "`rope_parameters` must be a dictionary with three fields, " + f"`rope_theta`, `rope_type` and `factor`, got {self.rope_parameters}" ) - rope_parameters_type = self.rope_parameters.get("type", None) + rope_parameters_type = self.rope_parameters.get("rope_type", None) rope_parameters_factor = self.rope_parameters.get("factor", None) if rope_parameters_type is None or rope_parameters_type not in [ "linear", diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index 2a3d2bbe319f..c4691b661af3 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -24,7 +24,6 @@ def __init__( bos_token_id=None, eos_token_id=50279, tie_word_embeddings=False, - rope_theta=10000.0, rope_parameters=None, attention_bias=False, attention_dropout=0.0, @@ -65,7 +64,11 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index 54e611e5f780..d2fe58d48da6 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_parameters (`Dict`, *optional*): + rope_parameters (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. 
Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. `rope_type` (`str`): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. @@ -199,7 +198,6 @@ def __init__( rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, rope_parameters=None, partial_rotary_factor=0.25, attention_bias=False, @@ -238,7 +236,11 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index 55e3e26f93b0..0ee650a70451 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -52,7 +52,6 @@ def __init__( moe_intermediate_size: int = 5120, moe_num_experts: int = 48, moe_top_k: int = 3, - rope_theta: float = 500000, rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 65536, share_expert_dim: int = 5120, @@ -132,7 +131,11 @@ def __init__( self.moe_top_k = moe_top_k # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.max_position_embedding = max_position_embedding self.share_expert_dim = share_expert_dim self.share_q_dim = share_q_dim From 6368078dfdba9b2ba93130f193df877e33804ca2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Nov 2025 20:12:25 +0100 Subject: [PATCH 42/70] fix Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 0235604459f2..ab9e157b820a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2072,7 +2072,9 @@ def _get_and_verify_max_len( # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. # To simplify the verification, we convert it to dict[str, TypedDict]. 
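Concretely, the normalization this hunk guards can be pictured as below; `ALLOWED_LAYER_TYPES` is assumed to hold the per-layer attention type strings, and the helper name is only for illustration:

    # Flat (single-layer-type) rope_parameters get wrapped under a dummy key so
    # the verification loop can always iterate over a dict of per-layer dicts.
    ALLOWED_LAYER_TYPES = {"full_attention", "sliding_attention"}  # assumed values

    def normalize(rope_parameters):
        if rope_parameters and not set(rope_parameters).issubset(ALLOWED_LAYER_TYPES):
            rope_parameters = {"": rope_parameters}
        return rope_parameters

    flat = {"rope_type": "yarn", "factor": 4.0, "rope_theta": 1e6}
    per_layer = {"full_attention": {"rope_type": "default", "rope_theta": 1e6}}

    assert normalize(flat) == {"": flat}      # v4-style flat dict gets wrapped
    assert normalize(per_layer) == per_layer  # v5 per-layer dict is kept as-is
    assert normalize(None) is None            # missing config stays None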
rope_parameters = getattr(hf_config, "rope_parameters", None) - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if rope_parameters and not set(rope_parameters.keys()).issubset( + ALLOWED_LAYER_TYPES + ): rope_parameters = {"": rope_parameters} # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE From d4b2fbb734b862347774df81fc6f156417a6d789 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:41:51 +0100 Subject: [PATCH 43/70] Don't delete the legacy attributes when still using v4 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index de5d2df55958..20568f73a6a8 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -418,11 +418,6 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: rope_parameters["original_max_position_embeddings"] = ompe # Write back to text_config text_config.rope_parameters = rope_parameters - # Delete legacy attributes - if hasattr(text_config, "rope_theta"): - delattr(text_config, "rope_theta") - if hasattr(text_config, "rope_scaling"): - delattr(text_config, "rope_scaling") # No RoPE parameters to patch if rope_parameters is None: From 1e68d271cb10d7bd2aab256f36f59801ccda7eb6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:45:26 +0100 Subject: [PATCH 44/70] Fix typo in commandr Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/commandr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index bffbecc9bf09..5ed920927c77 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -177,7 +177,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=config.rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) From db6a8806577b7e6725293df9a8f24001cfe0177e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:48:20 +0100 Subject: [PATCH 45/70] Fix typo in deepseek v2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/deepseek_v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 41f8a4334145..33c630f34590 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1092,7 +1092,6 @@ def __init__( v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, From 26a51d4585e6fa3177bacf87efdc00a77cf8678a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 13:05:00 +0100 Subject: [PATCH 46/70] Handle multimodal models where vision model uses RoPE Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 18 +++++++++++------- 1 file changed, 11 
insertions(+), 7 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 20568f73a6a8..62201b9b698f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -394,18 +394,22 @@ def file_or_path_exists( def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - text_config = config.get_text_config() - + # Handle nested configs (e.g., multi-modal models) + if sub_configs := getattr(config, "sub_configs", None): + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) + return + # Retrieve rope_parameters differently based on Transformers version if Version(version("transformers")) >= Version("5.0.0.dev0"): from transformers.modeling_rope_utils import RopeParameters rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( - text_config, "rope_parameters", None + config, "rope_parameters", None ) else: # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters - rope_theta: float | None = getattr(text_config, "rope_theta", None) - rope_scaling: dict | None = getattr(text_config, "rope_scaling", None) + rope_theta: float | None = getattr(config, "rope_theta", None) + rope_scaling: dict | None = getattr(config, "rope_scaling", None) rope_parameters = rope_scaling # Move rope_theta into rope_parameters if rope_theta is not None: @@ -416,8 +420,8 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: ompe := getattr(config, "original_max_position_embeddings", None) ): rope_parameters["original_max_position_embeddings"] = ompe - # Write back to text_config - text_config.rope_parameters = rope_parameters + # Write back to config + config.rope_parameters = rope_parameters # No RoPE parameters to patch if rope_parameters is None: From dd6924481e574054be82283f0f929e151ab1155e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 13:06:52 +0100 Subject: [PATCH 47/70] Use new default value of rope_parameters in kernels test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/kernels/core/test_pos_encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 357d9910347d..9ce7d508126f 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -123,7 +123,7 @@ def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] ROPE_THETAS = [10000, 1000000] ROPE_PARAMETERS = ( - None, + {"rope_type": "default"}, {"rope_type": "linear", "factor": (1,)}, {"rope_type": "dynamic", "factor": 1}, ) From 132dc4b4f9fcf2f5c6d9efff33152159099648f0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 13:09:14 +0100 Subject: [PATCH 48/70] Use `rope_parameters` instead of `base` in compile test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/compile/test_functionalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 11ae96e930da..515e0a93ac2a 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -137,7 +137,7 @@ def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): self.head_dim, rotary_dim=self.rotary_dim, 
max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, q, k): @@ -172,7 +172,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, hidden_states): From d7a6ded3454824efcfdb2185a2463768522a45d0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 14:55:37 +0100 Subject: [PATCH 49/70] Don't overwrite main config for v4 style Gemma 3 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gemma3.py | 2 +- vllm/model_executor/models/gemma3n.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 32b21d796b33..565719ae7fae 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -166,7 +166,7 @@ def __init__( else: # Transformers v4 rope config. # Global attention. Use the values in config.json. - rope_parameters = config.rope_parameters + rope_parameters = config.rope_parameters.copy() # Local attention. Override the values in config.json. if self.is_sliding: rope_parameters["rope_theta"] = config.rope_local_base_freq diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index f72e706a0a72..8f1447ba34a8 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -343,7 +343,7 @@ def __init__( else: # Transformers v4 rope config. # Global attention. Use the values in config.json. - rope_parameters = config.rope_parameters + rope_parameters = config.rope_parameters.copy() # Local attention. Override the values in config.json. if is_sliding: rope_parameters["rope_theta"] = config.rope_local_base_freq From 8ceffd6fb2931615021d305e140a8ed458547785 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 15:04:52 +0100 Subject: [PATCH 50/70] Only raise for `disable_sliding_window` if the model actually has `sliding_window` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index aa97ff374504..b9badf5f52cb 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2096,7 +2096,7 @@ def _get_and_verify_max_len( rope_type = rp["rope_type"] if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: + if disable_sliding_window and sliding_window is not None: # TODO(robertgshaw): Find a model that supports rope_parameters # with sliding window to see if this case should be allowed. raise NotImplementedError( @@ -2141,7 +2141,7 @@ def _get_and_verify_max_len( # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window: + if disable_sliding_window and sliding_window is not None: # TODO(robertgshaw): Find a model that has model_max_length # with sliding window to see if this case should be allowed. 
raise NotImplementedError( From 08126a9a5974a96653bb6661bc5f2aa779c5c2ef Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 15:36:57 +0100 Subject: [PATCH 51/70] Fix arctic config docstring for docs Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/configs/arctic.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 5c120cc44c03..ba4b1a8f701f 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): From f1c3c33cc2d90a26cdcbf3e3bbba83e91c360e71 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:15:57 +0100 Subject: [PATCH 52/70] Fix typo in gpt-oss Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 33daf407435d..faedc5ae9373 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -69,8 +69,8 @@ def __init__( max_position=config.max_position_embeddings, dtype=torch.float32, rope_parameters={ + "rope_theta": config.rope_parameters["rope_theta"], "rope_type": "yarn", - "rope_base": config.rope_parameters["rope_theta"], "factor": config.rope_parameters["factor"], "original_max_position_embeddings": config.rope_parameters[ "original_max_position_embeddings" From a2601ce06ef33e832e7934d7b6912c07143f1823 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:56:18 +0100 Subject: [PATCH 53/70] Remove disable_sliding_window errors Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index b9badf5f52cb..8131cb813b37 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2096,14 +2096,6 @@ def _get_and_verify_max_len( rope_type = rp["rope_type"] if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window and sliding_window is not None: - # TODO(robertgshaw): Find a model that supports rope_parameters - # with sliding window to 
see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models with " - "rope_parameters. Please raise an issue so we can investigate." - ) - # NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py # NOTE: This assumes all layer types have the same scaling factor. scaling_factor = rp.get("factor", scaling_factor) @@ -2140,16 +2132,7 @@ def _get_and_verify_max_len( # that will be bigger than derived_max_model_len. We compare user input # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window and sliding_window is not None: - # TODO(robertgshaw): Find a model that has model_max_length - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "model_max_length in the config. Please raise an issue " - "so we can investigate." - ) - else: + if model_max_length is None and max_model_len > model_max_length: msg = ( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" From 03d50e06d6d38bf22ed22df460640298172b390d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:59:45 +0100 Subject: [PATCH 54/70] Fix olmo2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/olmo2.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 4f20b8213117..f0f6b2f6b3e6 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -138,15 +138,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.attn", ) - # Rotary embeddings. - # TODO: Rope scaling should only be applied on full attention layers. Original - # implementation was setting the rope_parameters to None for sliding attention - # layers, but that does not disable rope scaling in vLLM. + # Rotary embeddings. Rope scaling is only applied on full attention layers. + if sliding_window is None: + rope_parameters = self.config.rope_parameters + else: + rope_theta = self.config.rope_parameters["rope_theta"] + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=self.config.rope_parameters, + rope_parameters=rope_parameters, ) # Attention output projection. 
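The OLMo2 change above reduces to a small per-layer selection rule: RoPE scaling stays on
full-attention layers, while sliding-window layers fall back to plain RoPE with the same base.
A minimal Python sketch of that rule, assuming a config shaped like the one in the diff
(the helper name is illustrative and not part of the patch):

    def select_rope_parameters(config, sliding_window):
        # Full-attention layers keep the configured scaling as-is.
        if sliding_window is None:
            return config.rope_parameters
        # Sliding-window layers fall back to unscaled RoPE, carrying only the base period.
        return {
            "rope_type": "default",
            "rope_theta": config.rope_parameters["rope_theta"],
        }

Only full-attention layers end up with a scaled rotary embedding; local layers get the plain one.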
From 93827b646dea63f78189923257576ad6fffa726f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:13:24 +0100 Subject: [PATCH 55/70] Fix custom code mm models Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 62201b9b698f..e6b463093210 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -399,6 +399,10 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: for sub_config in sub_configs: patch_rope_parameters(getattr(config, sub_config)) return + else: + # Some custom multi-modal configs don't use sub_configs.So we get the + # text config and assume that there is no RoPE in other modalities. + config = config.get_text_config() # Retrieve rope_parameters differently based on Transformers version if Version(version("transformers")) >= Version("5.0.0.dev0"): from transformers.modeling_rope_utils import RopeParameters From 3b3c23361fec0d048e9a9c3f48ba8a96de23696b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 18:19:20 +0100 Subject: [PATCH 56/70] Fix models with no rope info at all in their `config.json` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/config.py | 4 ++++ vllm/model_executor/models/gpt_j.py | 2 ++ vllm/model_executor/models/gpt_neox.py | 2 ++ vllm/model_executor/models/grok1.py | 5 +++-- vllm/model_executor/models/phi.py | 4 +++- vllm/model_executor/models/plamo2.py | 2 ++ 6 files changed, 16 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 01e17280e0bc..a1323839ca75 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -77,6 +77,8 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: config.rope_parameters["rope_theta"] = config.rotary_emb_base @@ -119,6 +121,8 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: config.rope_parameters["rope_theta"] = config.rotary_emb_base diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 762cd0c1f4d2..fd42b52b4e65 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,6 +95,8 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: config.rope_parameters["rope_theta"] = 10000 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index ccec9e556493..d3053a5c99ed 100644 --- 
a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,6 +92,8 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: config.rope_parameters["rope_theta"] = 10000 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 545b8f13988e..573ab762a41c 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -234,9 +234,10 @@ def __init__( if not self.use_fp8 and hasattr(quant_config, "is_fp8"): self.use_fp8 = quant_config.is_fp8 - # Requires transformers > 4.32.0 - # Default rope_theta value if not in config + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: + # Default rope_theta value if not in config config.rope_parameters["rope_theta"] = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 4983b0f6c14f..e62621aeac4e 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -115,8 +115,10 @@ def __init__( ) assert rotary_dim % 2 == 0 - # Refer to https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: + # Refer to https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 config.rope_parameters["rope_theta"] = 10000.0 max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 6de7f71bd2c3..6fdf9e4105b9 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -576,6 +576,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} if "rope_theta" not in config.rope_parameters: config.rope_parameters["rope_theta"] = 10000 From 3f9ce07456a9a38a2c391fa161e1015e220aab3c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 14 Nov 2025 19:19:58 +0100 Subject: [PATCH 57/70] Fix unaccounted for style of config Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index e6b463093210..d3b5309214af 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -9,7 +9,7 @@ from functools import cache, partial from importlib.metadata import version from pathlib import Path -from typing import Any, Literal, TypeVar +from typing import Any, Literal, TypeAlias, TypeVar import huggingface_hub from huggingface_hub import ( @@ -394,15 +394,6 @@ def file_or_path_exists( def patch_rope_parameters(config: 
PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - # Handle nested configs (e.g., multi-modal models) - if sub_configs := getattr(config, "sub_configs", None): - for sub_config in sub_configs: - patch_rope_parameters(getattr(config, sub_config)) - return - else: - # Some custom multi-modal configs don't use sub_configs.So we get the - # text config and assume that there is no RoPE in other modalities. - config = config.get_text_config() # Retrieve rope_parameters differently based on Transformers version if Version(version("transformers")) >= Version("5.0.0.dev0"): from transformers.modeling_rope_utils import RopeParameters @@ -414,6 +405,18 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters rope_theta: float | None = getattr(config, "rope_theta", None) rope_scaling: dict | None = getattr(config, "rope_scaling", None) + # Make best effort to retrieve parameters for multi-modal configs + if rope_theta is None and rope_scaling is None: + SubConfigs: TypeAlias = dict[str, PretrainedConfig] + sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) + if sub_configs: + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) + return + # Not all multi-modal configs use sub_configs + config = config.get_text_config() + rope_theta = getattr(config, "rope_theta", None) + rope_scaling = getattr(config, "rope_scaling", None) rope_parameters = rope_scaling # Move rope_theta into rope_parameters if rope_theta is not None: From f1714ac02360e33283c79979b1cef0fb3ebea527 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:19:53 +0100 Subject: [PATCH 58/70] Hopefully final fix for multimodal rope overrides Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d3b5309214af..2b8201dbc6a3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -401,22 +401,14 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( config, "rope_parameters", None ) + elif hasattr(config, "rope_parameters"): + # We are in Transformers v4 and rope_parameters + # has already been patched for this config + return else: # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters rope_theta: float | None = getattr(config, "rope_theta", None) rope_scaling: dict | None = getattr(config, "rope_scaling", None) - # Make best effort to retrieve parameters for multi-modal configs - if rope_theta is None and rope_scaling is None: - SubConfigs: TypeAlias = dict[str, PretrainedConfig] - sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) - if sub_configs: - for sub_config in sub_configs: - patch_rope_parameters(getattr(config, sub_config)) - return - # Not all multi-modal configs use sub_configs - config = config.get_text_config() - rope_theta = getattr(config, "rope_theta", None) - rope_scaling = getattr(config, "rope_scaling", None) rope_parameters = rope_scaling # Move rope_theta into rope_parameters if rope_theta is not None: @@ -720,7 +712,14 @@ def get_config( logger.debug("Overriding HF config with %s", hf_overrides_fn) config = hf_overrides_fn(config) + 
# Exhaustively patch RoPE parameters everywhere they might be patch_rope_parameters(config) + patch_rope_parameters(config.get_text_config()) + SubConfigs: TypeAlias = dict[str, PretrainedConfig] + sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) + if sub_configs: + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) if trust_remote_code: maybe_register_config_serialize_by_value() From 981aac45c7026b89846753bf8b27d458dc6c1e09 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:20:05 +0100 Subject: [PATCH 59/70] Fix condition for raising error Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 8131cb813b37..bebb42b92717 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2132,7 +2132,7 @@ def _get_and_verify_max_len( # that will be bigger than derived_max_model_len. We compare user input # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is None and max_model_len > model_max_length: + if model_max_length is None or max_model_len > model_max_length: msg = ( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" From 5c2f394e804112a758fda0811ea945c7414ceaf7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:31:51 +0100 Subject: [PATCH 60/70] Only override `rope_type` to `deepseek_yarn` if it was not `default` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/deepseek_v2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 33c630f34590..5b0eee98e563 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -477,7 +477,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if config.rope_parameters: + if config.rope_parameters["rope_type"] != "default": config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( @@ -488,7 +488,7 @@ def __init__( is_neox_style=False, ) - if config.rope_parameters: + if config.rope_parameters["rope_type"] != "default": mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) @@ -969,7 +969,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - if config.rope_parameters: + if config.rope_parameters["rope_type"] != "default": config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, @@ -978,7 +978,7 @@ def __init__( rope_parameters=config.rope_parameters, is_neox_style=False, ) - if config.rope_parameters: + if config.rope_parameters["rope_type"] != "default": mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) From 6c64ba51f86f172f129732f7a652ec6c395d749a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:18:20 +0100 Subject: [PATCH 61/70] Make 10000 the 
default base for `get_rope` if `rope_parameters == None` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../layers/rotary_embedding/__init__.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 30b4c1116896..ae8a7d93b50e 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -26,19 +26,23 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - rope_parameters: dict[str, Any], is_neox_style: bool = True, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - # Transforms every value that is a list into a tuple for caching calls - rope_parameters_tuple = { - k: tuple(v) if isinstance(v, list) else v for k, v in rope_parameters.items() - } - rope_parameters_args = tuple(rope_parameters_tuple.items()) + if rope_parameters is not None: + # Transforms every value that is a list into a tuple for caching calls + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in rope_parameters.items() + } + rope_parameters_args = tuple(rope_parameters_tuple.items()) + else: + rope_parameters_args = None if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -56,15 +60,15 @@ def get_rope( head_size, rotary_dim, max_position, - rope_parameters_args, is_neox_style, + rope_parameters_args, dual_chunk_attention_args, dtype, ) if key in _ROPE_DICT: return _ROPE_DICT[key] - base = rope_parameters["rope_theta"] + base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -80,6 +84,10 @@ def get_rope( dtype, **extra_kwargs, ) + elif not rope_parameters: + rotary_emb = RotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, dtype + ) else: scaling_type = rope_parameters["rope_type"] From 6beee2b421a0236da7a0cd0512bf459dfadf4bee Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:20:04 +0100 Subject: [PATCH 62/70] Set all model defaults which are not 10000 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/config.py | 11 +++-------- vllm/model_executor/models/ernie45_moe.py | 2 ++ vllm/model_executor/models/ernie45_vl_moe.py | 2 ++ vllm/model_executor/models/exaone4.py | 2 ++ vllm/model_executor/models/falcon_h1.py | 2 ++ vllm/model_executor/models/openpangu.py | 4 +++- vllm/model_executor/models/qwen2.py | 4 ++-- vllm/model_executor/models/qwen3.py | 2 ++ vllm/model_executor/models/seed_oss.py | 2 ++ vllm/transformers_utils/config.py | 9 +++++++++ 10 files changed, 29 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index a1323839ca75..3cf4bf991e66 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform +from vllm.transformers_utils.config import set_default_rope_theta from vllm.utils.math_utils import cdiv, round_up from 
vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec @@ -77,10 +78,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - config.rope_parameters["rope_theta"] = config.rotary_emb_base + set_default_rope_theta(config, default_theta=config.rotary_emb_base) config.rotary_kwargs = { "head_size": head_dim, @@ -121,10 +119,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - config.rope_parameters["rope_theta"] = config.rotary_emb_base + set_default_rope_theta(config, default_theta=config.rotary_emb_base) config.rotary_kwargs = { "head_size": head_dim, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 9da7a9c425ba..a7df3509e3ec 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -62,6 +62,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( @@ -330,6 +331,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + set_default_rope_theta(config, default_theta=500000) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index f609e0187d32..5b770bfbaf8e 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -58,6 +58,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .ernie45_moe import Ernie4_5_MoeMLP from .interfaces import SupportsPP @@ -411,6 +412,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + set_default_rope_theta(config, default_theta=500000) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index e851fafccad8..70f3cce2b7c5 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -51,6 +51,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -172,6 +173,7 @@ def __init__( # apply rotary embeddings to every layer in full attention models self.apply_rope_all_layers = "sliding_attention" not in config.layer_types + set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, diff --git 
a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 8dba76180005..f661cbe19f17 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -35,6 +35,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import ( HasInnerState, @@ -216,6 +217,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() + set_default_rope_theta(config, default_theta=1e11) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 2cbfb13a5678..f814cdfec5a2 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -77,6 +77,7 @@ sequence_parallel_chunk, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta def check_ffn_act_fn(act_fn: str): @@ -336,8 +337,9 @@ def __init__( ) # TODO: remove hard coding + set_default_rope_theta(config, default_theta=10000) rope_parameters = { - "rope_theta": config.rope_parameters.get("rope_theta", 10000), + "rope_theta": config.rope_parameters["rope_theta"], "beta_fast": 32, "beta_slow": 1, "factor": 1, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 5a0654fb7383..32b6d6dd07b8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -57,7 +57,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( @@ -213,7 +213,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 66e5b80392f9..93a629d81e8f 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP @@ -163,6 +164,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 5bc3886fa7ff..4744d8e44f39 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -54,6 +54,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, 
SupportsPP from .utils import ( @@ -197,6 +198,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + set_default_rope_theta(config, default_theta=1000000) # By default, SeedOss uses causal attention as it is a # decoder-only model. diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2b8201dbc6a3..41957b4616ee 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -392,6 +392,15 @@ def file_or_path_exists( ) +def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None: + """Some models may have no rope_theta in their config but still use RoPE. + This function sets a default rope_theta if it's missing.""" + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = default_theta + + def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" # Retrieve rope_parameters differently based on Transformers version From 002fb90788ae749626fb09c6272f0cb604de46b6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:20:34 +0100 Subject: [PATCH 63/70] Update models which can default to 10000 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gpt_j.py | 4 ---- vllm/model_executor/models/gpt_neox.py | 4 ---- vllm/model_executor/models/granitemoe.py | 2 +- vllm/model_executor/models/grok1.py | 7 +------ vllm/model_executor/models/internlm2.py | 2 +- vllm/model_executor/models/minicpm.py | 2 +- vllm/model_executor/models/minimax_m2.py | 2 +- vllm/model_executor/models/minimax_text_01.py | 2 +- vllm/model_executor/models/orion.py | 2 +- vllm/model_executor/models/phi.py | 5 ----- vllm/model_executor/models/plamo2.py | 5 ----- vllm/model_executor/models/qwen.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- 13 files changed, 9 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index fd42b52b4e65..e94de8952fa6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,10 +95,6 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - config.rope_parameters["rope_theta"] = 10000 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index d3053a5c99ed..815c2fba4d9f 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,10 +92,6 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - config.rope_parameters["rope_theta"] = 10000 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, diff --git a/vllm/model_executor/models/granitemoe.py 
b/vllm/model_executor/models/granitemoe.py index 300d53369ec9..8f4139d63c3f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -140,8 +140,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position: int = 4096 * 32, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 573ab762a41c..4bf23cd6fd19 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -134,8 +134,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position: int = 4096 * 32, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -234,11 +234,6 @@ def __init__( if not self.use_fp8 and hasattr(quant_config, "is_fp8"): self.use_fp8 = quant_config.is_fp8 - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - # Default rope_theta value if not in config - config.rope_parameters["rope_theta"] = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index bc08367c5b9c..dc8f821bd134 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -91,7 +91,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 7791c52808d8..04923833065f 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -230,7 +230,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index f0874dc36a2d..4955c68c0cda 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -149,7 +149,7 @@ def __init__( num_heads: int, num_kv_heads: int, rotary_dim: int, - rope_parameters: dict[str, Any], + rope_parameters: dict[str, Any] | None = None, attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 90ae5b832a00..50f7396e2de6 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -187,8 +187,8 @@ def __init__( head_dim: int, num_kv_heads: int, rotary_dim: int, - rope_parameters: dict, max_position: int = 4096 * 32, + rope_parameters: dict | None = None, sliding_window: int | None = None, quant_config: QuantizationConfig | None = 
None, layer_idx: int = None, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 07cd2357a49e..b30be93ca726 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -88,7 +88,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index e62621aeac4e..da476f621627 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -115,11 +115,6 @@ def __init__( ) assert rotary_dim % 2 == 0 - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - # Refer to https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - config.rope_parameters["rope_theta"] = 10000.0 max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 6fdf9e4105b9..5dd72227c3f5 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -576,11 +576,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) - if getattr(config, "rope_parameters", None) is None: - config.rope_parameters = {"rope_type": "default"} - if "rope_theta" not in config.rope_parameters: - config.rope_parameters["rope_theta"] = 10000 - max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 6906a73579d1..c973e7917098 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -83,7 +83,7 @@ def __init__( hidden_size: int, num_heads: int, max_position_embeddings: int, - rope_parameters: dict[str, Any], + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5687897719b1..6b97d0b2ca2e 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -194,7 +194,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, From 99c5d476a7b7295115e246cf2a349697bd2b2127 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 13:27:38 +0100 Subject: [PATCH 64/70] Fix nemotron config Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/configs/nemotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 3b535979ade0..377b73ce088e 100644 --- 
a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -173,7 +173,7 @@ def __init__( or partial_rotary_factor ) self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_validation() + self._rope_parameters_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias From c38e8bbb2a19f29fc2283fdce53ecc5def66af66 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 13:30:59 +0100 Subject: [PATCH 65/70] Fix ernie 4.5 vl Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/ernie45_vl_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 5b770bfbaf8e..50e033d77606 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -154,7 +154,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position_embeddings=max_position_embeddings, - rope_parameters=rope_parameters, + base=rope_parameters["rope_theta"], is_neox_style=False, dtype=torch.get_default_dtype(), mrope_section=[h_rope, w_rope, t_rope], From eebe73c5cf0479cf4ed15b14eb6d2f403da0250e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 13:36:29 +0100 Subject: [PATCH 66/70] Fix benchmarks/tests where `get_rope` is called with positional arguments Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 8 ++++---- tests/kernels/core/test_mrope.py | 4 ++-- tests/kernels/core/test_pos_encoding.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 8866bfdfac88..83bd91917508 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -85,9 +85,9 @@ def benchmark_mrope( tp_size: int, num_heads: int, num_kv_heads: int, - rope_parameters: dict[str, Any], max_position: int = 8192, is_neox_style: bool = True, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -101,8 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - rope_parameters=rope_parameters, is_neox_style=is_neox_style, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -298,8 +298,8 @@ def benchmark_mrope( head_dim = config.hidden_size // total_num_heads q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim - rope_parameters = config.rope_parameters is_neox_style = True + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -311,8 +311,8 @@ def benchmark_mrope( num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_parameters=rope_parameters, is_neox_style=is_neox_style, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 2792a5e7ab0d..43b242ab2d58 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -120,8 +120,8 @@ def test_mrope( head_size=head_dim, rotary_dim=rotary_dim, 
max_position=max_position, - rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) @@ -191,8 +191,8 @@ def test_mrope_torch_compile_tracing( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 9ce7d508126f..1d8e8e47c804 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -84,7 +84,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - rope = get_rope(head_size, rotary_dim, max_position, rope_parameters, is_neox_style) + rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -181,8 +181,8 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - rope_parameters, is_neox_style, + rope_parameters, dtype, ) # check if cache take effect From a60b5ec2009922c143d16c5a190391976b6b5562 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 15:04:44 +0100 Subject: [PATCH 67/70] Fix get_rope kwargs in vision transformers Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/glm4_1v.py | 1 - vllm/model_executor/models/qwen2_5_vl.py | 1 - vllm/model_executor/models/qwen2_vl.py | 1 - vllm/model_executor/models/qwen3_omni_moe_thinker.py | 1 - vllm/model_executor/models/qwen3_vl.py | 1 - 5 files changed, 5 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 65c3fc2d9e97..48d9085ba0b1 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -703,7 +703,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2e4fd9645d88..10e9705792de 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,7 +641,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 53df5972a8fe..119daa7a1ed6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -643,7 +643,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 8274b92138f7..07085f8b860b 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -338,7 +338,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 
99a4007ef7f2..430bbcd39360 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -345,7 +345,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) From 00f285320131530ebbf77942b51f98439ca10229 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 15:06:43 +0100 Subject: [PATCH 68/70] Update new model Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/afmoe.py | 17 +---------------- vllm/transformers_utils/configs/afmoe.py | 7 +++++-- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 6f654f47495f..4eb5665a71fc 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -5,7 +5,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -202,7 +199,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # Check if this is a local attention layer @@ -246,8 +242,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config["rope_parameters"], is_neox_style=True, ) else: @@ -303,14 +298,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix @@ -323,8 +310,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py index 9b634fd037a3..47fee9882f9f 100644 --- a/vllm/transformers_utils/configs/afmoe.py +++ b/vllm/transformers_utils/configs/afmoe.py @@ -24,7 +24,7 @@ def __init__( rms_norm_eps: float = 1e-5, use_cache: bool = True, tie_word_embeddings: bool = False, - rope_theta: float = 10000.0, + rope_parameters: dict | None = None, rope_scaling: dict | None = None, num_experts: int = 64, num_experts_per_tok: int = 6, @@ -56,7 +56,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 10000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": 
rope_theta} + self.rope_parameters = rope_parameters self.rope_scaling = rope_scaling self.moe_intermediate_size = moe_intermediate_size From 717a7044e19910519708fce402e0d98b48e0121c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 15:10:45 +0100 Subject: [PATCH 69/70] Missed positional args Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/kernels/core/test_pos_encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 1d8e8e47c804..a8ed3825689d 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -154,8 +154,8 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - rope_parameters, is_neox_style, + rope_parameters, dtype, ) # different settings cannot share the same rope module From a9fa3b0430525c20e7a88f1d699ae8ebd4c37501 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 18 Nov 2025 17:07:10 +0100 Subject: [PATCH 70/70] Fix nemotron config validation Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/configs/nemotron.py | 39 +++++++++------------ 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 377b73ce088e..d112c71d7d20 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -193,27 +193,22 @@ def _rope_parameters_validation(self): if self.rope_parameters is None: return - if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 3: - raise ValueError( - "`rope_parameters` must be a dictionary with three fields, " - f"`rope_theta`, `rope_type` and `factor`, got {self.rope_parameters}" - ) - rope_parameters_type = self.rope_parameters.get("rope_type", None) - rope_parameters_factor = self.rope_parameters.get("factor", None) - if rope_parameters_type is None or rope_parameters_type not in [ - "linear", - "dynamic", - ]: - raise ValueError( - "`rope_parameters`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_parameters_type}" - ) - if ( - rope_parameters_factor is None - or not isinstance(rope_parameters_factor, float) - or rope_parameters_factor <= 1.0 - ): + rope_type: str | None = self.rope_parameters.get("rope_type", None) + factor: float | None = self.rope_parameters.get("factor", None) + + if rope_type not in {"default", "linear", "dynamic"}: raise ValueError( - "`rope_parameters`'s factor field must be a float > 1, got " - f"{rope_parameters_factor}" + "`rope_type` must be one of ['default', 'linear', 'dynamic'], " + f"got {rope_type}" ) + if rope_type != "default": + if factor is None: + raise ValueError( + "If `rope_type` is not 'default', `rope_parameters` " + "must include a `factor` field. Got `None`." + ) + if not isinstance(factor, float) or factor <= 1.0: + raise ValueError( + "`rope_parameters`'s factor field must be a float > 1, got " + f"{factor}" + )
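Taken together, the compatibility handling in these patches amounts to folding the legacy
`rope_theta` / `rope_scaling` pair into a single `rope_parameters` dict. A simplified sketch of
that conversion, assuming plain keyword inputs rather than a PretrainedConfig (the function name
and the fallback value are illustrative):

    def to_rope_parameters(rope_theta=None, rope_scaling=None, default_theta=10000.0):
        # Start from the legacy scaling dict if present, otherwise unscaled RoPE.
        rope_parameters = dict(rope_scaling) if rope_scaling else {"rope_type": "default"}
        # Fold the legacy base period in without clobbering an explicit value.
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = (
                rope_theta if rope_theta is not None else default_theta
            )
        return rope_parameters

    # For example:
    #   to_rope_parameters(500000.0, {"rope_type": "yarn", "factor": 4.0})
    #   -> {"rope_type": "yarn", "factor": 4.0, "rope_theta": 500000.0}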