diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4ac76aba67b9..31f57e50a272 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -876,12 +876,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579..83bd91917508 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,7 +6,7 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, -# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # speedup # @@ -86,9 +86,8 @@ def benchmark_mrope( num_heads: int, num_kv_heads: int, max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -203,9 +201,8 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -255,9 +252,8 @@ def benchmark_mrope( "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -303,7 +299,7 @@ def benchmark_mrope( q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +311,8 @@ def benchmark_mrope( num_heads=num_heads, num_kv_heads=num_kv_heads, 
max_position=max_position, - rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index df39e4c25d5c..67d33e1881ee 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This script demonstrates how to extend the context length -of a Qwen model using the YARN method (rope_scaling) +of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. Usage: @@ -19,8 +19,8 @@ def create_llm(): # Use yarn to extend context hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 11ae96e930da..515e0a93ac2a 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -137,7 +137,7 @@ def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, q, k): @@ -172,7 +172,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, hidden_states): diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 02b795721f46..43b242ab2d58 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -5,11 +5,11 @@ import pytest import torch from packaging.version import Version -from transformers import AutoConfig from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -98,8 +98,7 @@ def test_mrope( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -113,7 +112,6 @@ def test_mrope( ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -122,9 +120,8 @@ def test_mrope( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) @@ -173,8 +170,7 @@ def test_mrope_torch_compile_tracing( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, 
False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -187,7 +183,6 @@ def test_mrope_torch_compile_tracing( else config.hidden_size // total_num_heads ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -196,9 +191,8 @@ def test_mrope_torch_compile_tracing( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index c35ee5016ba0..a8ed3825689d 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -74,7 +74,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: float = 10000, + rope_theta: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -83,7 +83,8 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -120,9 +121,9 @@ def test_rotary_embedding( @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] - BASES = [10000, 1000000] - ROPE_SCALINGS = ( - None, + ROPE_THETAS = [10000, 1000000] + ROPE_PARAMETERS = ( + {"rope_type": "default"}, {"rope_type": "linear", "factor": (1,)}, {"rope_type": "dynamic", "factor": 1}, ) @@ -130,9 +131,9 @@ def test_rope_module_cache(): HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, - BASES, + ROPE_THETAS, IS_NEOX_STYLE, - ROPE_SCALINGS, + ROPE_PARAMETERS, DTYPES, ) rope_setting_id_map: dict[str, int] = {} @@ -141,20 +142,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # different settings cannot share the same rope module @@ -168,20 +169,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # check if cache take effect diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index dfd317bcf72f..af33fd4e3fc3 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -201,7 +201,7 @@ class ModelConfig: sliding_window: int = 128 initial_context_length: int = 4096 rope_theta: float = 150000.0 - rope_scaling_factor: 
float = 32.0 + rope_parameters_factor: float = 32.0 rope_ntk_alpha: float = 1.0 rope_ntk_beta: float = 32.0 diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 88f088c60327..d6216a87a229 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 +from typing import Any + import pytest from ...utils import EmbedModelInfo @@ -79,8 +81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_illegal(model_info, vllm_runner): - hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + hf_overrides: dict[str, Any] = { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): pass hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db2..16f68d18fc68 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -249,45 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} - TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + TEST_ROPE_PARAMETERS = { + "rope_theta": 16_000_000.0, + "rope_type": "dynamic", + "factor": 2.0, + } + LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"} + LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0} llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct") - assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None - assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 + assert ( + getattr(llama_model_config.hf_config, "rope_parameters", None) + == LLAMA_ROPE_PARAMETERS + ) assert llama_model_config.max_model_len == 8192 llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, - "rope_theta": TEST_ROPE_THETA, - }, + hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS}, ) assert ( - getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING + getattr(llama_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) - assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 longchat_model_config = ModelConfig("lmsys/longchat-13b-16k") - # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + # Check if 
LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config assert all( - longchat_model_config.hf_config.rope_scaling.get(key) == value - for key, value in LONGCHAT_ROPE_SCALING.items() + longchat_model_config.hf_config.rope_parameters.get(key) == value + for key, value in LONGCHAT_ROPE_PARAMETERS.items() ) assert longchat_model_config.max_model_len == 16384 longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_PARAMETERS, }, ) assert ( - getattr(longchat_model_config.hf_config, "rope_scaling", None) - == TEST_ROPE_SCALING + getattr(longchat_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/config/model.py b/vllm/config/model.py index b3a28af6de38..3a85b8ad09f4 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -13,6 +13,7 @@ from pydantic import ConfigDict, SkipValidation, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig @@ -2077,31 +2078,32 @@ def _get_and_verify_max_len( ) derived_max_model_len = default_max_len - rope_scaling = getattr(hf_config, "rope_scaling", None) + # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. + # To simplify the verification, we convert it to dict[str, TypedDict]. + rope_parameters = getattr(hf_config, "rope_parameters", None) + if rope_parameters and not set(rope_parameters.keys()).issubset( + ALLOWED_LAYER_TYPES + ): + rope_parameters = {"": rope_parameters} + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. - if rope_scaling is not None and "gemma3" not in hf_config.model_type: - # No need to consider "type" key because of patch_rope_scaling when - # loading HF config - rope_type = rope_scaling["rope_type"] - - if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: - # TODO(robertgshaw): Find a model that supports rope_scaling - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "with rope_scaling. Please raise an issue so we can " - "investigate." - ) - - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rope_scaling.get("factor", 1.0) - - if rope_type == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] - derived_max_model_len *= scaling_factor + if rope_parameters is not None and "gemma3" not in hf_config.model_type: + scaling_factor = 1.0 + for rp in rope_parameters.values(): + # No need to consider "type" key because of patch_rope_parameters when + # loading HF config + rope_type = rp["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + # NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + # NOTE: This assumes all layer types have the same scaling factor. 
+ scaling_factor = rp.get("factor", scaling_factor) + + if rope_type == "yarn": + derived_max_model_len = rp["original_max_position_embeddings"] + # Do this outside loop since all layer types should have the same scaling + derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: derived_max_model_len = encoder_config["max_seq_length"] @@ -2111,7 +2113,9 @@ def _get_and_verify_max_len( if max_model_len is None: # For LongRoPE, default to original_max_position_embeddings to avoid # performance degradation for shorter sequences - if rope_scaling is not None and rope_scaling["rope_type"] == "longrope": + if rope_parameters is not None and any( + rp["rope_type"] == "longrope" for rp in rope_parameters.values() + ): max_model_len = int( getattr( hf_config, "original_max_position_embeddings", derived_max_model_len @@ -2128,16 +2132,7 @@ def _get_and_verify_max_len( # that will be bigger than derived_max_model_len. We compare user input # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window: - # TODO(robertgshaw): Find a model that has model_max_length - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "model_max_length in the config. Please raise an issue " - "so we can investigate." - ) - else: + if model_max_length is None or max_model_len > model_max_length: msg = ( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 56c165f9c041..ae8a7d93b50e 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -26,23 +26,23 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: float, is_neox_style: bool = True, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - if rope_scaling is not None: + if rope_parameters is not None: # Transforms every value that is a list into a tuple for caching calls - rope_scaling_tuple = { - k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items() + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in rope_parameters.items() } - rope_scaling_args = tuple(rope_scaling_tuple.items()) + rope_parameters_args = tuple(rope_parameters_tuple.items()) else: - rope_scaling_args = None + rope_parameters_args = None if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -60,15 +60,15 @@ def get_rope( head_size, rotary_dim, max_position, - base, is_neox_style, - rope_scaling_args, + rope_parameters_args, dual_chunk_attention_args, dtype, ) if key in _ROPE_DICT: return _ROPE_DICT[key] + base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -84,18 +84,18 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_scaling: + elif not rope_parameters: rotary_emb = RotaryEmbedding( head_size, 
rotary_dim, max_position, base, is_neox_style, dtype ) else: - scaling_type = rope_scaling["rope_type"] + scaling_type = rope_parameters["rope_type"] if scaling_type == "llama3": - scaling_factor = rope_scaling["factor"] - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] rotary_emb = Llama3RotaryEmbedding( head_size, rotary_dim, @@ -113,7 +113,7 @@ def get_rope( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) elif scaling_type == "default": - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: rotary_emb = MRotaryEmbedding( head_size, rotary_dim, @@ -121,8 +121,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), ) else: rotary_emb = RotaryEmbedding( @@ -134,7 +134,7 @@ def get_rope( dtype, ) elif scaling_type == "linear": - scaling_factor = rope_scaling["factor"] + scaling_factor = rope_parameters["factor"] rotary_emb = LinearScalingRotaryEmbedding( head_size, rotary_dim, @@ -145,8 +145,8 @@ def get_rope( dtype, ) elif scaling_type == "ntk": - scaling_factor = rope_scaling["factor"] - mixed_b = rope_scaling.get("mixed_b", None) + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b") rotary_emb = NTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -158,8 +158,8 @@ def get_rope( mixed_b, ) elif scaling_type == "dynamic": - if "alpha" in rope_scaling: - scaling_alpha = rope_scaling["alpha"] + if "alpha" in rope_parameters: + scaling_alpha = rope_parameters["alpha"] rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, @@ -169,8 +169,8 @@ def get_rope( scaling_alpha, dtype, ) - elif "factor" in rope_scaling: - scaling_factor = rope_scaling["factor"] + elif "factor" in rope_parameters: + scaling_factor = rope_parameters["factor"] rotary_emb = DynamicNTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -185,11 +185,11 @@ def get_rope( "Dynamic rope scaling must contain either 'alpha' or 'factor' field" ) elif scaling_type == "yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -199,7 +199,7 @@ def get_rope( "apply_yarn_scaling", ) } - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: extra_kwargs.pop("apply_yarn_scaling", None) rotary_emb = MRotaryEmbedding( head_size, @@ -208,8 +208,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), scaling_factor=scaling_factor, **extra_kwargs, ) @@ -225,12 +225,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == 
"deepseek_yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] # assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -252,12 +252,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "longrope": - short_factor = rope_scaling["short_factor"] - long_factor = rope_scaling["long_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ("short_mscale", "long_mscale") } rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 6f654f47495f..4eb5665a71fc 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -5,7 +5,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -202,7 +199,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # Check if this is a local attention layer @@ -246,8 +242,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config["rope_parameters"], is_neox_style=True, ) else: @@ -303,14 +298,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix @@ -323,8 +310,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 0a8f21abb0a3..b75e91319bba 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -118,8 +117,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 
10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -155,7 +152,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,9 +172,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -224,7 +218,6 @@ def forward( def _init_rotary_emb( self, config: ApertusConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -236,8 +229,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -253,14 +245,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -288,8 +272,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 20c3ff075450..b3887b16f4d7 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -103,15 +103,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Rotary embedding parameters (reuse LLaMA defaults) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Determine if attention bias is needed (some variants use bias terms) attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -133,8 +124,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b5cc07a56535..b75a254761d4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -292,7 +292,6 @@ def __init__( self.kv_size = 
self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.scaling = self.head_dim**-0.5 self.qkv_proj = QKVParallelLinear( @@ -317,7 +316,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8991ef4c606b..edf47270e527 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -136,7 +136,7 @@ def __init__( hidden_size: int, num_heads: int, position_embedding: str, - rope_theta: float = 10000, + rope_parameters: dict, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -150,7 +150,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = hidden_size // self.total_num_heads self.position_embedding = position_embedding - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # pylint: disable=invalid-name @@ -192,7 +191,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( @@ -229,13 +228,12 @@ def __init__( ): super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 024425bb2440..cc10e936a2d3 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -135,9 +135,8 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, - rope_scaling=config.rope_scaling, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index e0a2defd5127..16648929c577 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -157,8 +157,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -179,7 +177,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -193,8 +190,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - 
base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index fb7476c45fcd..792c15ba4ff8 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -264,8 +264,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -292,7 +291,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -317,8 +315,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -368,14 +365,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -384,8 +373,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, @@ -438,14 +426,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -454,8 +434,7 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5d6f5e9125a2..dbfcd62d0bca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,6 +99,7 @@ def __init__( # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope @@ -106,7 +107,7 @@ def __init__( 
self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, - base=10000 * rope_ratio, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 77bb17851981..5ed920927c77 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -156,8 +156,6 @@ def __init__( self.max_position_embeddings = getattr( config, "model_max_length", None ) or getattr(config, "max_position_embeddings", 8192) - self.rope_theta = config.rope_theta - self.rope_scaling = getattr(config, "rope_scaling", None) self.use_qk_norm = getattr(config, "use_qk_norm", False) self.qkv_proj = QKVParallelLinear( self.hidden_size, @@ -179,8 +177,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 66b246878b0a..3cf4bf991e66 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform +from vllm.transformers_utils.config import set_default_rope_theta from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec @@ -46,8 +47,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -78,12 +78,13 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if not model_config.enforce_eager: max_position = round_up(max_position, 8) + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -117,18 +118,20 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) + + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } # we ignore config.rotary_scaling_factor so that for datasets shorter # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. - # The context extension uses vllm style rope_theta and rope_scaling. + # The context extension uses vllm style rope_theta and rope_parameters. 
# See #17785 #18755 if ( not vllm_config.model_config.hf_overrides @@ -172,7 +175,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") hf_text_config.max_position_embeddings = max_trained_positions - hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] # The priority of sentence_bert_config.json is higher # than max_position_embeddings @@ -246,8 +249,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 528ef4f76742..2c729019081a 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -197,7 +197,10 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads self.total_num_kv_heads = config.attn_config.kv_n_heads self.clip_qkv = config.attn_config.clip_qkv - self.rope_theta = config.attn_config.rope_theta + rope_parameters = { + "rope_type": "default", + "rope_theta": int(config.attn_config.rope_theta), + } self.max_position = config.max_seq_len # pylint: disable=invalid-name @@ -221,7 +224,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e8ee9951d611..6675b2133f38 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( config: DeepseekV2Config | DeepseekV3Config, hidden_size: int, num_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -139,7 +136,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -162,8 +158,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -409,8 +404,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -430,7 +423,6 @@ def __init__( assert num_heads % tp_size == 0 self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings assert topk_indices_buffer is None, ( "topk_indices_buffer is not \ @@ -485,21 +477,20 @@ def 
__init__( quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -903,8 +894,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -927,7 +916,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: @@ -981,19 +969,18 @@ def __init__( prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -1073,8 +1060,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) moe_layer_freq = getattr(config, "moe_layer_freq", 1) # DecoderLayers are created with `make_layers` which passes the prefix @@ -1107,8 +1092,6 @@ def __init__( v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index d24da0c42a25..e65c275106a4 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -202,8 +201,6 @@ def __init__( num_heads: int, num_kv_heads: int, config: Dots1Config, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -229,7 +226,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings attention_bias = config.attention_bias @@ -255,8 +251,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -296,8 +291,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = layer_idx @@ -307,8 +300,6 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, config=config, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f2999968669f..a7df3509e3ec 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -62,6 +62,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( @@ -232,9 +233,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, @@ -266,7 +266,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -291,9 +290,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=False, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -333,16 +331,14 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index e8ef86f9b7f0..50e033d77606 100644 --- 
a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -58,6 +58,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .ernie45_moe import Ernie4_5_MoeMLP from .interfaces import SupportsPP @@ -91,9 +92,8 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, freq_allocation: int = 20, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, @@ -126,7 +126,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -155,7 +154,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position_embeddings=max_position_embeddings, - base=rope_theta, + base=rope_parameters["rope_theta"], is_neox_style=False, dtype=torch.get_default_dtype(), mrope_section=[h_rope, w_rope, t_rope], @@ -413,8 +412,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) @@ -423,8 +421,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, freq_allocation=freq_allocation, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 6c56bfc433c7..d13275488fe9 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -113,8 +112,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -144,7 +141,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -173,8 +169,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -207,8 +202,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -221,8 +214,6 @@ def __init__( hidden_size=hidden_size, num_heads=num_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, 
- rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=bias, @@ -251,14 +242,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -272,8 +255,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index b89e168ada20..70f3cce2b7c5 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -23,7 +23,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -52,6 +51,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -110,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 1000000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -141,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,12 +173,12 @@ def __init__( # apply rotary embeddings to every layer in full attention models self.apply_rope_all_layers = "sliding_attention" not in config.layer_types + set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -227,14 +224,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -249,8 +238,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 
85acdff3d96b..dc2d51f340c8 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -164,13 +164,12 @@ def __init__( ) if self.use_rotary: - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 3653425b8e1c..f661cbe19f17 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -35,6 +35,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import ( HasInnerState, @@ -216,8 +217,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 1e11) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1e11) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -242,7 +242,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -256,8 +255,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=None, # see impl of get_rope ) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 7aaae7c503b5..00c7f59a0809 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from collections.abc import Iterable from functools import cache from itertools import islice +from typing import Any import torch from torch import nn @@ -127,8 +128,8 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -153,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -176,7 +176,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -218,7 +218,7 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4d5d6cbb37c6..9b6cfe693230 100644 --- 
a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -107,7 +107,6 @@ def __init__( num_kv_heads: int, head_dim: int, max_position_embeddings: int, - rope_theta: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, @@ -134,7 +133,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.query_pre_attn_scalar**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -156,7 +154,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, ) @@ -206,7 +204,6 @@ def __init__( num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, cache_config=cache_config, quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 357e61a4e78b..565719ae7fae 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -155,25 +155,28 @@ def __init__( self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) layer_idx = extract_layer_index(prefix) - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + self.is_sliding = layer_type == "sliding_attention" sliding_window = config.sliding_window if self.is_sliding else None # Initialize the rotary embedding. - if self.is_sliding: - # Local attention. Override the values in config.json. - self.rope_theta = config.rope_local_base_freq - self.rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. # Global attention. Use the values in config.json. - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. + if self.is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq + self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) if getattr(config, "is_causal", True): diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 64443190f53e..8f1447ba34a8 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -332,18 +332,21 @@ def __init__( ) layer_idx = extract_layer_index(prefix) - is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" self.sliding_window = config.sliding_window if is_sliding else None # Initialize the rotary embedding. - if is_sliding: - # Local attention. Override the values in config.json. - rope_theta = config.rope_local_base_freq - rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. 
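Editor's note: the Gemma3 / Gemma3n hunks in this stretch have to cope with two config layouts: a Transformers-v5 style where `rope_parameters` is keyed by layer type, and the older flat layout where sliding-window layers override `rope_theta` with `rope_local_base_freq`. A standalone sketch of that selection logic, using a stand-in config object (attribute names mirror the hunks above; nothing here is vLLM API and the toy values are made up):

```python
from types import SimpleNamespace


def select_rope_parameters(config, layer_type: str) -> dict:
    """Pick the RoPE parameters for one attention layer.

    Mirrors the Gemma3 hunk: prefer a per-layer-type entry (Transformers v5
    layout); otherwise fall back to the flat dict and, for sliding-window
    layers, swap in the local base frequency.
    """
    if layer_type in config.rope_parameters:
        # Transformers v5: rope_parameters is keyed by layer type.
        return config.rope_parameters[layer_type]
    # Transformers v4: one flat dict shared by every layer.
    rope_parameters = dict(config.rope_parameters)
    if layer_type == "sliding_attention":
        rope_parameters["rope_theta"] = config.rope_local_base_freq
    return rope_parameters


# Toy config using the v4-style flat layout.
cfg = SimpleNamespace(
    rope_parameters={"rope_type": "default", "rope_theta": 1_000_000.0},
    rope_local_base_freq=10_000.0,
)
print(select_rope_parameters(cfg, "sliding_attention"))  # rope_theta -> 10000.0
print(select_rope_parameters(cfg, "full_attention"))     # rope_theta -> 1000000.0
```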
# Global attention. Use the values in config.json. - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. + if is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq first_kv_shared_layer_idx = ( config.num_hidden_layers - config.num_kv_shared_layers @@ -383,9 +386,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index faa0674a2e43..f8ef3b0385fb 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -57,10 +57,8 @@ def __init__( max_position: int = 4096 * 32, head_dim: int | None = None, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -86,7 +84,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -107,8 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) @@ -150,8 +146,6 @@ def __init__( quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Glm4Attention( config=config, @@ -159,12 +153,10 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 65c3fc2d9e97..48d9085ba0b1 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -703,7 +703,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 1422dbe9b3cd..5aa51af54a00 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -26,7 +26,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -233,8 +232,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -264,7 +261,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * 
self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = use_qk_norm @@ -291,8 +287,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( @@ -341,8 +336,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. @@ -354,8 +347,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e416ecde0c1e..e94de8952fa6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,13 +95,12 @@ def __init__( scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index af0c9209231c..815c2fba4d9f 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,13 +92,12 @@ def __init__( scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7df3b087ccb8..f310f71af92d 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,16 +67,16 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, dtype=torch.float32, - rope_scaling={ + rope_parameters={ + "rope_theta": config.rope_parameters["rope_theta"], "rope_type": "yarn", - "factor": config.rope_scaling["factor"], - "original_max_position_embeddings": config.rope_scaling[ + "factor": config.rope_parameters["factor"], + "original_max_position_embeddings": config.rope_parameters[ "original_max_position_embeddings" ], - "beta_fast": config.rope_scaling["beta_fast"], - "beta_slow": config.rope_scaling["beta_slow"], + "beta_fast": config.rope_parameters["beta_fast"], + "beta_slow": config.rope_parameters["beta_slow"], }, is_neox_style=True, ) @@ -90,7 +90,6 @@ def __init__( self.q_size = 
self.num_attention_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.qkv_proj = QKVParallelLinear( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c44b4021471e..1dc205b47753 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -112,8 +111,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -143,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -167,8 +163,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,14 +199,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -225,8 +212,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5c6759ded066..8f4139d63c3f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -141,8 +141,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -172,7 +171,6 @@ def __init__( if attention_multiplier is not None else self.head_dim**-1 ) - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -194,9 +192,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -235,16 +232,12 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 05177f1d1ac2..1a952107948e 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -274,10 +274,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=int(config.rope_theta), - rope_scaling=config.rope_scaling - if hasattr(config, "rope_scaling") and config.rope_scaling is not None - else None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) else: diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 926c539af33b..fd346db7e35a 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -84,16 +84,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 9dc231863f74..4bf23cd6fd19 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -25,6 +25,7 @@ from collections.abc import Iterable from itertools import islice +from typing import Any import torch import torch.nn.functional as F @@ -134,7 +135,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -161,7 +162,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -183,7 +183,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) @@ -234,15 +234,12 @@ def __init__( if not self.use_fp8 and hasattr(quant_config, "is_fp8"): self.use_fp8 = quant_config.is_fp8 - # Requires transformers > 4.32.0 - # Default rope_theta value if not in config - rope_theta = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git 
a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 1eadcbe67ade..9fa5e2bd33f2 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import regex as re import torch @@ -142,8 +141,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -177,7 +174,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -204,8 +200,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -254,8 +249,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -289,7 +282,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -314,8 +306,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -494,14 +485,6 @@ def __init__( if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False @@ -520,8 +503,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, @@ -537,8 +518,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 60fbeb842dd4..dc8f821bd134 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -91,8 +91,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - 
rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -120,7 +119,6 @@ def __init__( self.kv_size = self.num_kv_heads * self.head_dim self.key_value_groups = int(self.num_heads / self.num_kv_heads) self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.wqkv = QKVParallelLinear( @@ -144,8 +142,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,15 +201,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6dc081e34157..a57db82242af 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -30,15 +30,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index f3675075a48f..4562b2202c5e 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -190,9 +189,7 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, use_nope: bool = False, - rope_scaling: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -210,11 +207,9 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None - assert rope_scaling is None assert num_heads % tp_size == 0 self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index aeb25602f11a..74bdde27ece5 100644 --- a/vllm/model_executor/models/lfm2.py +++ 
b/vllm/model_executor/models/lfm2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -96,8 +95,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -126,7 +123,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -149,8 +145,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -199,14 +194,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2Attention( @@ -215,8 +202,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 6b7b5564ee98..c088a0821152 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -189,8 +188,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -219,7 +216,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -242,8 +238,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -293,14 +288,6 @@ def __init__( self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = 
getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2MoeAttention( @@ -309,8 +296,6 @@ def __init__( hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 0a3f37c30ab5..d5b49d2fb4c2 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -120,8 +119,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -157,7 +154,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings llama_4_scaling_config = getattr(config, "llama_4_scaling", None) @@ -186,9 +182,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -258,7 +252,6 @@ def forward( def _init_rotary_emb( self, config: LlamaConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -270,8 +263,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -291,14 +283,6 @@ def __init__( quant_config = self.get_quant_config(vllm_config) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -326,8 +310,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a7e0732ec71e..4c6d1d424475 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -19,7 +19,6 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -208,7 +205,6 @@ def __init__( self.floor_scale = getattr(config, "floor_scale", 8192.0) self.attn_scale = getattr(config, "attn_scale", 0.1) - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.n_rep = self.num_heads // self.num_kv_heads self.qk_norm = ( @@ -248,8 +244,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=int(rope_theta), - rope_scaling=rope_scaling if rope_scaling != "default" else None, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) if not self.nope @@ -331,8 +326,6 @@ def __init__( self.layer_idx = extract_layer_index(prefix) self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling max_position_embeddings = config.max_position_embeddings self.self_attn = Llama4Attention( @@ -340,8 +333,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 5de10e708683..fafe97cd2be7 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -108,8 +108,7 @@ def __init__( eos_token_id=100001, pretraining_tp=1, tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, mla_scale_q_lora=False, @@ -162,8 +161,13 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mla_scale_q_lora = mla_scale_q_lora @@ -336,15 +340,7 @@ def __init__( super().__init__() self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) # Dual attention structure self.self_attn = nn.ModuleList( @@ -361,8 +357,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=None diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 914b097fe199..04923833065f 100644 --- a/vllm/model_executor/models/minicpm.py +++ 
b/vllm/model_executor/models/minicpm.py @@ -230,8 +230,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -257,7 +256,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -281,8 +279,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -324,8 +321,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -339,8 +334,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index d3b6966ee3a7..2d775219fc97 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -25,8 +25,6 @@ # limitations under the License. 
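Editor's note: the `longcat_flash` config a little further up keeps accepting the legacy `rope_theta` / `rope_scaling` kwargs and folds them into `rope_parameters`. A self-contained sketch of that normalization in plain Python, mirroring the constructor hunk (the helper name and the example values are mine):

```python
from typing import Any


def normalize_rope_kwargs(
    rope_parameters: dict[str, Any] | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Fold legacy `rope_scaling` / `rope_theta` kwargs into one dict.

    Mirrors the LongcatFlashConfig hunk above: a legacy `rope_scaling` dict
    wins over `rope_parameters`, and `rope_theta` is filled in only if the
    resulting dict does not already carry it.
    """
    rope_scaling = kwargs.pop("rope_scaling", None)
    rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
    rope_theta = kwargs.pop("rope_theta", 1_000_000.0)
    if "rope_theta" not in rope_parameters:
        rope_parameters["rope_theta"] = rope_theta
    return rope_parameters


# Legacy-style call: both old kwargs are still understood.
print(
    normalize_rope_kwargs(
        rope_scaling={"rope_type": "yarn", "factor": 2.0},
        rope_theta=500_000.0,
    )
)
# {'rope_type': 'yarn', 'factor': 2.0, 'rope_theta': 500000.0}
```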
"""Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any - import torch from torch import nn from transformers import PretrainedConfig @@ -62,8 +60,6 @@ def __init__( v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -84,7 +80,6 @@ def __init__( self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.q_a_proj = ReplicatedLinear( @@ -127,8 +122,7 @@ def __init__( self.qk_rope_head_dim, rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_local_heads, @@ -204,8 +198,6 @@ def _init_attn_block(self): v_head_dim=self.config.v_head_dim, q_lora_rank=self.config.q_lora_rank, kv_lora_rank=self.config.kv_lora_rank, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index d0cdb70aa857..e6bccfcac4f1 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -69,8 +69,6 @@ def __init__( self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -84,8 +82,7 @@ def _init_attn_block(self): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 49d2f2d26196..4955c68c0cda 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -149,8 +149,7 @@ def __init__( num_heads: int, num_kv_heads: int, rotary_dim: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, @@ -180,7 +179,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -205,8 +203,7 @@ def __init__( self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -252,8 +249,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = 
getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = max( @@ -269,8 +264,7 @@ def __init__( num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rotary_dim=config.rotary_dim, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index bf1ecc822756..50f7396e2de6 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -188,7 +188,7 @@ def __init__( num_kv_heads: int, rotary_dim: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict | None = None, sliding_window: int | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, @@ -214,7 +214,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.sliding_window = sliding_window self.prefix = prefix @@ -247,7 +246,7 @@ def __init__( head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=int(rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, dtype=torch.float32, ) @@ -287,8 +286,6 @@ def __init__( self.hidden_size = config.hidden_size self.expert_num = expert_num - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads @@ -328,7 +325,7 @@ def __init__( else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, quant_config=quant_config, layer_idx=self._ilayer, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d7a1cb82fb4f..54ab8dd493e7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -161,7 +161,6 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,7 +188,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -211,7 +209,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -248,15 +246,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = MixtralAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, 
prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e25a104d822a..286859d188d3 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -292,13 +292,17 @@ def __init__( prefix=f"{prefix}.o_proj", ) + rope_parameters = { + "rope_type": "mllama4", + "rope_theta": config.rope_parameters["rope_theta"], + } + self.rotary_emb = get_rope( head_size=self.head_dim, rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, - base=config.rope_theta, - rope_scaling={"rope_type": "mllama4"}, + rope_parameters=rope_parameters, is_neox_style=False, dtype=torch.complex64, # important ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ab83a271e30a..dc06938d5d6e 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -410,7 +410,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -437,7 +436,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 92dcf5ea5700..c3337bd1ea69 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -150,8 +149,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -181,7 +178,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings @@ -206,8 +202,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( @@ -243,14 +238,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -264,8 +251,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, 
max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b839206a3094..2eebe38051cb 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -82,8 +81,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -97,8 +94,6 @@ def __init__( hidden_size, num_heads, num_kv_heads, - rope_theta, - rope_scaling, max_position_embeddings, quant_config, bias, @@ -111,7 +106,6 @@ def __init__( def _init_rotary_emb( self, config, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: # Enables YARN for Mistral and LLaMA4 derivatives. @@ -126,8 +120,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -148,14 +141,6 @@ def __init__( self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -176,8 +161,6 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 487e3f671a45..bd8a8e317544 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -87,7 +87,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.clip_qkv = config.clip_qkv # Attention input projection. 
Projects x -> (q, k, v) @@ -105,7 +104,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 045582c889ee..f0f6b2f6b3e6 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -99,7 +99,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = self.config.max_position_embeddings - self.rope_theta = self.config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -139,15 +138,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.attn", ) - # Rotary embeddings. Rope scaling is only applied on full attention - # layers. - self.rope_scaling = self.config.rope_scaling if sliding_window is None else None + # Rotary embeddings. Rope scaling is only applied on full attention layers. + if sliding_window is None: + rope_parameters = self.config.rope_parameters + else: + rope_theta = self.config.rope_parameters["rope_theta"] + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) # Attention output projection. diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 499eb05de76e..c39e338d72e2 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -123,8 +123,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) num_heads = config.num_attention_heads @@ -148,7 +146,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,8 +173,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index d13a745beffe..f814cdfec5a2 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -77,6 +77,7 @@ sequence_parallel_chunk, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta def check_ffn_act_fn(act_fn: str): @@ -259,7 +260,6 @@ def __init__( v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = 
None, quant_config: QuantizationConfig | None = None, @@ -274,8 +274,6 @@ def __init__( self.v_head_dim = v_head_dim self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank - self.rope_theta = rope_theta - self.tp_size = get_tensor_model_parallel_world_size() if num_heads % self.tp_size != 0: raise ValueError( @@ -339,7 +337,9 @@ def __init__( ) # TODO: remove hard coding - rope_scaling = { + set_default_rope_theta(config, default_theta=10000) + rope_parameters = { + "rope_theta": config.rope_parameters["rope_theta"], "beta_fast": 32, "beta_slow": 1, "factor": 1, @@ -353,8 +353,7 @@ def __init__( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) @@ -407,8 +406,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -454,7 +451,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -475,9 +471,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -521,7 +515,6 @@ def forward( def _init_rotary_emb( self, config: PretrainedConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -533,8 +526,7 @@ def _init_rotary_emb( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) @@ -555,7 +547,6 @@ def __init__( parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) @@ -579,7 +570,6 @@ def __init__( config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, @@ -607,8 +597,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=getattr(config, "rope_scaling", None), max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 859cd2cecf89..b30be93ca726 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -88,8 +88,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -115,7 +114,6 @@ def __init__( 
self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -139,8 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -175,15 +172,12 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 9db6c317c26a..63d2fff6ec8b 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -112,10 +112,8 @@ def __init__( num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -140,7 +138,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config # Get total_ut_steps from config, default to 4 if not specified @@ -170,8 +167,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = nn.ModuleList() @@ -226,9 +222,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -244,10 +237,8 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3bf6a1d9763d..98963d52e484 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,6 @@ def __init__( self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.partial_rotary_factor = config.partial_rotary_factor 
self.is_causal = True @@ -138,7 +137,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 8fee53c23fb4..da476f621627 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -115,16 +115,12 @@ def __init__( ) assert rotary_dim % 2 == 0 - # pylint: disable=C0301 - # Refer to: - # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = getattr(config, "rope_theta", 10000.0) max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 92fd858b608b..8ffac95d9396 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -86,7 +86,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters=None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=2, @@ -119,7 +119,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + if rope_parameters is None: + rope_theta = kwargs.pop("rope_theta", 1e6) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -302,12 +304,11 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, head_dim: int | None = None, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: dict | None = None, prefix: str = "", ) -> None: super().__init__() @@ -332,8 +333,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.qkv_proj = QKVParallelLinear( hidden_size, @@ -355,9 +354,8 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) self.attn = Attention( self.num_heads, @@ -393,7 +391,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = PhiMoEAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -402,10 +399,9 @@ def __init__( head_dim=getattr( config, "head_dim", self.hidden_size // config.num_attention_heads ), - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = PhiMoE( diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 0c87f5000ff4..5dd72227c3f5 100644 --- 
a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -576,10 +576,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No prefix=f"{prefix}.o_proj", ) - self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -590,8 +586,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> No self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, ) self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm.weight = torch.nn.Parameter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 50a125c3f597..c973e7917098 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -83,8 +83,7 @@ def __init__( hidden_size: int, num_heads: int, max_position_embeddings: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -117,8 +116,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -153,14 +151,11 @@ def __init__( super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1bbb969ce5aa..32b6d6dd07b8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -57,7 +57,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( @@ -114,11 +114,10 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -143,7 +142,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -167,8 +165,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, 
max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) attn_cls = ( @@ -216,9 +213,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -237,10 +232,9 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2e4fd9645d88..10e9705792de 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,7 +641,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2ff0d19df238..6b97d0b2ca2e 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -194,8 +194,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -222,7 +221,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -248,8 +246,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -291,8 +288,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -301,8 +296,7 @@ def __init__( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 53df5972a8fe..119daa7a1ed6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -643,7 +643,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git 
a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 8d7f22a33fe6..93a629d81e8f 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP @@ -57,14 +58,13 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, max_position: int = 4096 * 32, head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -89,7 +89,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -113,8 +112,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -166,9 +164,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -187,13 +183,12 @@ def __init__( num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 96751fee800b..8ee3dd99e11d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -216,8 +216,7 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -247,7 +246,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -273,8 +271,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -326,8 +323,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -336,8 +331,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 86508a7c6431..6809628d4f99 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -753,8 +753,7 @@ def __init__( head_size=self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=config.partial_rotary_factor, dual_chunk_attention_config=self.dual_chunk_attention_config, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 8274b92138f7..07085f8b860b 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -338,7 +338,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 99a4007ef7f2..430bbcd39360 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -345,7 +345,6 @@ def __init__( head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index bf211d28f184..4744d8e44f39 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -54,6 +54,7 @@ maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -112,11 +113,10 @@ def __init__( num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -140,7 +140,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -163,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -200,9 +198,7 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) # By default, SeedOss uses causal attention as it is a # decoder-only model. @@ -219,10 +215,9 @@ def __init__( max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 4ec855f79444..7e9fc51036d2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -25,7 +25,6 @@ """Inference-only Solar model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ def __init__( hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -142,7 +139,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -166,8 +162,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -202,15 +197,6 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -224,8 +210,6 @@ def __init__( num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 06eb7201c1a8..a738fcbb4ee2 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -153,7 +153,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + rope_parameters=self.config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0f2942acd500..1118fca3cac9 100644 --- 
a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -91,7 +91,6 @@ def __init__( self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias @@ -115,7 +114,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 4fff356b29e2..3c377a2c539d 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -36,6 +36,7 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.step3_vl import Step3TextConfig from .interfaces import SupportsPP from .utils import ( @@ -144,9 +145,8 @@ def __init__( num_heads: int, num_kv_heads: int, norm_eps: float, - rope_theta: int, + rope_parameters: dict[str, Any], share_q_dim: int | None = None, - rope_scaling: dict[str, Any] | None = None, max_position_embedding: int = 8192, head_dim: int = 256, cache_config: CacheConfig | None = None, @@ -198,8 +198,7 @@ def __init__( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embedding, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) scaling = self.head_dim**-0.5 self.attn = Attention( @@ -227,15 +226,13 @@ def forward( class Step3TextDecoderLayer(nn.Module): def __init__( self, - config: ModelConfig, + config: Step3TextConfig, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: super().__init__() - config = config.hf_config self.hidden_size = config.hidden_size - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Step3TextAttention( hidden_size=self.hidden_size, @@ -247,8 +244,7 @@ def __init__( max_position_embedding=config.max_position_embedding, head_dim=config.head_dim, share_q_dim=config.share_q_dim, - rope_theta=config.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -338,7 +334,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( - config=vllm_config.model_config, + config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 517eb54d53ac..b807f45b5d52 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,6 +22,7 @@ import torch from torch import nn +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: """ text_config = vllm_config.model_config.hf_config.get_text_config() # Dynamic rope scaling is not compatible with torch.compile - rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} - return 
rope_scaling.get("rope_type") != "dynamic" + rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} + if rope_parameters: + # Nest rope_parameters if not nested already to simplify logic + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters = {"": rope_parameters} + return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) + return True diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 64e6979c8fcf..5f5f973e0c8d 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -128,7 +128,6 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_hybrid_layers = num_hybrid_layers - self.rope_theta = config.rope_theta self.attention_hidden_size = config.attention_hidden_size self.total_num_attention_heads = config.num_attention_heads @@ -233,8 +232,7 @@ def __init__( head_size=self.attention_head_dim, rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, - base=self.rope_theta, - rope_scaling=None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 49250e071eab..df24738477e7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,8 +7,9 @@ from collections.abc import Callable from dataclasses import asdict from functools import cache, partial +from importlib.metadata import version from pathlib import Path -from typing import Any, Literal, TypeVar +from typing import Any, Literal, TypeAlias, TypeVar import huggingface_hub from huggingface_hub import ( @@ -24,7 +25,9 @@ RepositoryNotFoundError, RevisionNotFoundError, ) +from packaging.version import Version from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -390,21 +393,61 @@ def file_or_path_exists( ) -def patch_rope_scaling(config: PretrainedConfig) -> None: +def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None: + """Some models may have no rope_theta in their config but still use RoPE. 
+ This function sets a default rope_theta if it's missing.""" + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = default_theta + + +def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - patch_rope_scaling(text_config) + # Retrieve rope_parameters differently based on Transformers version + if Version(version("transformers")) >= Version("5.0.0.dev0"): + from transformers.modeling_rope_utils import RopeParameters - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None: - patch_rope_scaling_dict(rope_scaling) + rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( + config, "rope_parameters", None + ) + elif hasattr(config, "rope_parameters"): + # We are in Transformers v4 and rope_parameters + # has already been patched for this config + return + else: + # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters + rope_theta: float | None = getattr(config, "rope_theta", None) + rope_scaling: dict | None = getattr(config, "rope_scaling", None) + rope_parameters = rope_scaling + # Move rope_theta into rope_parameters + if rope_theta is not None: + rope_parameters = rope_parameters or {"rope_type": "default"} + rope_parameters["rope_theta"] = rope_theta + # Add original_max_position_embeddings if present + if rope_parameters and ( + ompe := getattr(config, "original_max_position_embeddings", None) + ): + rope_parameters["original_max_position_embeddings"] = ompe + # Write back to config + config.rope_parameters = rope_parameters + + # No RoPE parameters to patch + if rope_parameters is None: + return + + # Handle nested rope_parameters in interleaved sliding attention models + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in rope_parameters.values(): + patch_rope_parameters_dict(rope_parameters_layer_type) + else: + patch_rope_parameters_dict(rope_parameters) -def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: - if "rope_type" in rope_scaling and "type" in rope_scaling: - rope_type = rope_scaling["rope_type"] - rope_type_legacy = rope_scaling["type"] +def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: + if "rope_type" in rope_parameters and "type" in rope_parameters: + rope_type = rope_parameters["rope_type"] + rope_type_legacy = rope_parameters["type"] if rope_type != rope_type_legacy: raise ValueError( f"Found conflicts between 'rope_type={rope_type}' (modern " @@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: "You should only specify one of them." 
) - if "rope_type" not in rope_scaling and "type" in rope_scaling: - rope_scaling["rope_type"] = rope_scaling["type"] + if "rope_type" not in rope_parameters and "type" in rope_parameters: + rope_parameters["rope_type"] = rope_parameters["type"] logger.info("Replacing legacy 'type' key with 'rope_type'") - if "rope_type" not in rope_scaling: - raise ValueError("rope_scaling should have a 'rope_type' key") + if "rope_type" not in rope_parameters: + raise ValueError("rope_parameters should have a 'rope_type' key") - if rope_scaling["rope_type"] == "su": - rope_scaling["rope_type"] = "longrope" + if rope_parameters["rope_type"] == "su": + rope_parameters["rope_type"] = "longrope" logger.warning("Replacing legacy rope_type 'su' with 'longrope'") - elif rope_scaling["rope_type"] == "mrope": - assert "mrope_section" in rope_scaling - rope_scaling["rope_type"] = "default" + elif rope_parameters["rope_type"] == "mrope": + assert "mrope_section" in rope_parameters + rope_parameters["rope_type"] = "default" logger.warning("Replacing legacy rope_type 'mrope' with 'default'") def _uses_mrope(config: PretrainedConfig) -> bool: - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is None: + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is None: return False - return "mrope_section" in rope_scaling + return "mrope_section" in rope_parameters def uses_mrope(config: PretrainedConfig) -> bool: @@ -679,7 +722,14 @@ def get_config( logger.debug("Overriding HF config with %s", hf_overrides_fn) config = hf_overrides_fn(config) - patch_rope_scaling(config) + # Exhaustively patch RoPE parameters everywhere they might be + patch_rope_parameters(config) + patch_rope_parameters(config.get_text_config()) + SubConfigs: TypeAlias = dict[str, PretrainedConfig] + sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) + if sub_configs: + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) if trust_remote_code: maybe_register_config_serialize_by_value() diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py index 9b634fd037a3..47fee9882f9f 100644 --- a/vllm/transformers_utils/configs/afmoe.py +++ b/vllm/transformers_utils/configs/afmoe.py @@ -24,7 +24,7 @@ def __init__( rms_norm_eps: float = 1e-5, use_cache: bool = True, tie_word_embeddings: bool = False, - rope_theta: float = 10000.0, + rope_parameters: dict | None = None, rope_scaling: dict | None = None, num_experts: int = 64, num_experts_per_tok: int = 6, @@ -56,7 +56,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 10000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.rope_scaling = rope_scaling self.moe_intermediate_size = moe_intermediate_size diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 1707e15285c8..ba4b1a8f701f 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. 
- rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -132,7 +139,7 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters: dict[str, Any] | None = None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=1, @@ -165,7 +172,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1e6) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 1f2f4d446288..c343dc0999a8 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -25,8 +26,7 @@ def __init__( bos_token_id=None, eos_token_id=100257, tie_word_embeddings=False, - rope_theta=500000.0, - rope_scaling=None, + rope_parameters: dict[str, Any] | None = None, attention_bias=False, attention_dropout=0.0, num_experts_per_tok=5, @@ -62,8 +62,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -73,5 +78,5 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + if self.rope_parameters is not None and "type" in self.rope_parameters: + self.rope_parameters["rope_type"] = self.rope_parameters["type"] diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index 65ddf48c5249..14894816801d 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -29,8 +29,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, tie_word_embeddings=False, moe_intermediate_size: int | None = None, moe_renormalize: bool = True, @@ -73,8 +72,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py index 37c038e12db8..b399a03c030f 100644 --- a/vllm/transformers_utils/configs/lfm2_moe.py +++ b/vllm/transformers_utils/configs/lfm2_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. max_position_embeddings (`int`, *optional*, defaults to 128000): The maximum sequence length that this model might ever be used with. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -100,7 +101,7 @@ def __init__( bos_token_id: int = 1, eos_token_id: int = 2, tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 128_000, use_cache: bool = True, norm_eps: float = 0.00001, @@ -121,7 +122,10 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py index e49bd26b2b00..f1bbd057103e 100644 --- a/vllm/transformers_utils/configs/midashenglm.py +++ b/vllm/transformers_utils/configs/midashenglm.py @@ -98,6 +98,6 @@ def __init__( if text_config else Qwen2_5OmniTextConfig() ) - self.text_config.rope_scaling = None # uses_mrope is false + self.text_config.rope_parameters = None # uses_mrope is false self.audio_token_id = audio_token_id super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index c6f04febe37e..8f72f0b28b0d 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict: "apply_scale": "apply_yarn_scaling", } yarn_config = config.get("yarn") or {} - config["rope_scaling"] = { + config["rope_parameters"] = { "rope_type": "yarn", "mscale_all_dim": 1, } for old_name, new_name in yarn_config_map.items(): if old_name in yarn_config: - config["rope_scaling"][new_name] = yarn_config.pop(old_name) + config["rope_parameters"][new_name] = yarn_config.pop(old_name) assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 60eed549561f..d112c71d7d20 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -132,8 +132,7 @@ def __init__( bos_token_id=2, eos_token_id=3, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, @@ -160,8 +159,13 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") @@ -169,7 +173,7 @@ def __init__( or partial_rotary_factor ) self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_validation() + self._rope_parameters_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias @@ -182,31 +186,29 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with two fields, " - f"`type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}" - ) - if ( - rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0 - ): + rope_type: str | None = self.rope_parameters.get("rope_type", None) + factor: float | None = self.rope_parameters.get("factor", None) + + if rope_type not in {"default", "linear", "dynamic"}: raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, got " - f"{rope_scaling_factor}" + "`rope_type` must be one of ['default', 'linear', 'dynamic'], " + f"got {rope_type}" ) + if rope_type != "default": + if factor is None: + raise ValueError( + "If `rope_type` is not 'default', `rope_parameters` " + "must include a `factor` field. Got `None`." 
+ ) + if not isinstance(factor, float) or factor <= 1.0: + raise ValueError( + "`rope_parameters`'s factor field must be a float > 1, got " + f"{factor}" + ) diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index f5a9a7cd36bd..c4691b661af3 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -24,8 +24,7 @@ def __init__( bos_token_id=None, eos_token_id=50279, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, rms_norm_eps=1e-5, @@ -63,8 +62,13 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index 21750bde2f87..d2fe58d48da6 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_parameters (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. `rope_type` (`str`): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. 
@@ -199,8 +198,7 @@ def __init__( rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.25, attention_bias=False, attention_dropout=0.0, @@ -236,8 +234,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index 637b82d88e26..0ee650a70451 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -52,8 +52,7 @@ def __init__( moe_intermediate_size: int = 5120, moe_num_experts: int = 48, moe_top_k: int = 3, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 65536, share_expert_dim: int = 5120, share_q_dim: int = 2048, @@ -130,8 +129,13 @@ def __init__( self.moe_intermediate_size = moe_intermediate_size self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.max_position_embedding = max_position_embedding self.share_expert_dim = share_expert_dim self.share_q_dim = share_q_dim
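
Migration note for out-of-tree models (not part of the diff above): the config classes touched here (Arctic, FlexOlmo, KimiLinear, Lfm2Moe, Nemotron, Olmo3, PhiMoE, Qwen3Next, Step3Text, AfMoe) all follow the same backward-compatibility pattern, which is to accept a `rope_parameters` dict and fold any legacy `rope_theta`/`rope_scaling` kwargs into it, while attention layers now pass `config.rope_parameters` to `get_rope` instead of `base=`/`rope_scaling=`. The sketch below only illustrates that pattern under stated assumptions: the helper name `build_rope_parameters`, the default theta, and the sample values are hypothetical, and the commented `get_rope` call merely mirrors the new keyword shown in the hunks, not a verbatim vLLM API surface.

# Illustrative sketch, assuming a config-side shim like the ones added above.
# `build_rope_parameters` is a hypothetical helper name, not a vLLM function.
from typing import Any


def build_rope_parameters(
    kwargs: dict[str, Any],
    rope_parameters: dict[str, Any] | None = None,
    default_theta: float = 10000.0,  # assumed default; real configs differ per model
) -> dict[str, Any]:
    """Fold legacy `rope_theta`/`rope_scaling` kwargs into `rope_parameters`."""
    # Prefer an explicit legacy `rope_scaling` dict if the checkpoint provides one.
    rope_scaling = kwargs.pop("rope_scaling", None)
    rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
    # A legacy scalar `rope_theta` moves inside the dict.
    rope_theta = kwargs.pop("rope_theta", default_theta)
    rope_parameters.setdefault("rope_theta", rope_theta)
    return rope_parameters


if __name__ == "__main__":
    # Old-style checkpoint kwargs (illustrative values)...
    legacy = {"rope_theta": 1e6, "rope_scaling": {"rope_type": "yarn", "factor": 4.0}}
    params = build_rope_parameters(legacy)
    print(params)  # {'rope_type': 'yarn', 'factor': 4.0, 'rope_theta': 1000000.0}

    # ...which a model layer then passes straight through, e.g.:
    # rotary_emb = get_rope(
    #     head_size,
    #     rotary_dim=head_size,
    #     max_position=max_position_embeddings,
    #     rope_parameters=config.rope_parameters,  # replaces base=/rope_scaling=
    # )

Per the `patch_rope_parameters` and `can_enable_torch_compile` hunks above, `rope_parameters` may also appear in nested per-layer-type form (keys drawn from `ALLOWED_LAYER_TYPES` in interleaved sliding-attention models); consumers detect that case via the key-subset check and iterate over the nested dicts rather than reading `rope_type` at the top level.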