Merged
Commits (changes shown from 27 of 73 commits)
a62c2df
Rename `rope_scaling` -> `rope_parameters` in `get_rope`
hmellor Nov 12, 2025
f42b03d
Patch rope parameters to new name, `rope_parameters`
hmellor Nov 12, 2025
a2a9437
Update models where it's a simple rename
hmellor Nov 12, 2025
fba5bf5
Fix model config overrides
hmellor Nov 12, 2025
ee5cf66
Update examples
hmellor Nov 12, 2025
080530d
Update benchmarks
hmellor Nov 12, 2025
889b900
More renaming in transformers utils
hmellor Nov 12, 2025
50b1a87
Fix `patch_rope_parameters` for when `rope_scaling` was explicitly `N…
hmellor Nov 12, 2025
bd182e0
Update Gemma3 and Gemma3n
hmellor Nov 12, 2025
4c61e2e
Merge branch 'main' into update-rope-config
hmellor Nov 13, 2025
65c8658
Get `rope_theta` from the new location too
hmellor Nov 13, 2025
5d65739
Fix condition for non gemma3 models
hmellor Nov 13, 2025
b4e1967
Make Transformers backend torch compile check work with new rope params
hmellor Nov 13, 2025
ee77bd7
Re-enable a load of Transformers nightly tests which are now fixed
hmellor Nov 13, 2025
df4c007
Update the custom configs
hmellor Nov 13, 2025
325ff8d
Make sure scaling factor always exists
hmellor Nov 13, 2025
11c23a7
A couple more models that now init on v5
hmellor Nov 13, 2025
4ea113c
Update Commandr
hmellor Nov 13, 2025
59b0f27
Update Qwen3Next
hmellor Nov 13, 2025
064441b
Update Olmo2
hmellor Nov 13, 2025
bdd0e6c
rope_parameters always present because of rope_theta
hmellor Nov 13, 2025
f224ef4
Update LFM2MoE
hmellor Nov 13, 2025
19dcc18
Update LFM2
hmellor Nov 13, 2025
2eecd31
Update the rest
hmellor Nov 13, 2025
e95ccd4
update tests
hmellor Nov 13, 2025
f2bac15
Update configs
hmellor Nov 13, 2025
36e8a1f
Missed 2
hmellor Nov 13, 2025
dfa75cf
Improve comment about what `rope_parameters` is
hmellor Nov 13, 2025
708ea0c
Move scaling factor out of loop
hmellor Nov 13, 2025
4a28512
Early exit `patch_rope_parameters` if no rope params present
hmellor Nov 13, 2025
dfb476f
Be more explicit about v4 vs v5 behaviour
hmellor Nov 13, 2025
97bb339
Update a few models to not pass `base` outside of `rope_parameters`
hmellor Nov 13, 2025
97766f5
Update some more models
hmellor Nov 13, 2025
783962b
Update some more models
hmellor Nov 13, 2025
797fbea
Add back `type` -> `rope_type` for legacy custom models
hmellor Nov 13, 2025
b780892
More models
hmellor Nov 13, 2025
ad9dff2
Fix docs build
hmellor Nov 13, 2025
461ff94
Update some more models
hmellor Nov 13, 2025
fa2cced
Update some more models
hmellor Nov 13, 2025
4127d54
Remove last references to `base` arg of `get_rope`
hmellor Nov 13, 2025
1ebd0e4
Update mrope test
hmellor Nov 13, 2025
ec30fef
Check everything
hmellor Nov 13, 2025
6368078
fix
hmellor Nov 13, 2025
482f378
Merge branch 'main' into update-rope-config
hmellor Nov 14, 2025
d4b2fbb
Don't delete the legacy attributes when still using v4
hmellor Nov 14, 2025
1e68d27
Fix typo in commandr
hmellor Nov 14, 2025
db6a880
Fix typo in deepseek v2
hmellor Nov 14, 2025
26a51d4
Handle multimodal models where vision model uses RoPE
hmellor Nov 14, 2025
dd69244
Use new default value of rope_parameters in kernels test
hmellor Nov 14, 2025
132dc4b
Use `rope_parameters` instead of `base` in compile test
hmellor Nov 14, 2025
d7a6ded
Don't overwrite main config for v4 style Gemma 3
hmellor Nov 14, 2025
8ceffd6
Only raise for `disable_sliding_window` if the model actually has `sl…
hmellor Nov 14, 2025
08126a9
Fix arctic config docstring for docs
hmellor Nov 14, 2025
f1c3c33
Fix typo in gpt-oss
hmellor Nov 14, 2025
a2601ce
Remove disable_sliding_window errors
hmellor Nov 14, 2025
03d50e0
Fix olmo2
hmellor Nov 14, 2025
93827b6
Fix custom code mm models
hmellor Nov 14, 2025
3b3c233
Fix models with no rope info at all in their `config.json`
hmellor Nov 14, 2025
3f9ce07
Fix unaccounted for style of config
hmellor Nov 14, 2025
f1714ac
Hopefully final fix for multimodal rope overrides
hmellor Nov 15, 2025
981aac4
Fix condition for raising error
hmellor Nov 15, 2025
5c2f394
Only override `rope_type` to `deepseek_yarn` if it was not `default`
hmellor Nov 15, 2025
6c64ba5
Make 10000 the default base for `get_rope` if `rope_parameters == None`
hmellor Nov 15, 2025
6beee2b
Set all model defaults which are not 10000
hmellor Nov 15, 2025
002fb90
Update models which can default to 10000
hmellor Nov 15, 2025
99c5d47
Fix nemotron config
hmellor Nov 18, 2025
c38e8bb
Fix ernie 4.5 vl
hmellor Nov 18, 2025
eebe73c
Fix benchmarks/tests where `get_rope` is called with positional argum…
hmellor Nov 18, 2025
540a46b
Merge branch 'main' into update-rope-config
hmellor Nov 18, 2025
a60b5ec
Fix get_rope kwargs in vision transformers
hmellor Nov 18, 2025
00f2853
Update new model
hmellor Nov 18, 2025
717a704
Missed positional args
hmellor Nov 18, 2025
a9fa3b0
Fix nemotron config validation
hmellor Nov 18, 2025
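Taken together, the commits fold the legacy `rope_scaling` dict and the standalone `rope_theta` attribute into a single `rope_parameters` dict, both on the HF config and in the `get_rope` call. The snippet below is a minimal before/after sketch, not code from this PR: the keyword names follow the hunks further down where they appear, and the model name and dtype are illustrative assumptions.

```python
import torch

from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.transformers_utils.config import get_config

# Illustrative model; any RoPE-based checkpoint is handled the same way.
config = get_config("Qwen/Qwen2.5-VL-3B-Instruct", False).get_text_config()

head_dim = getattr(
    config, "head_dim", config.hidden_size // config.num_attention_heads
)

# Before: rope_theta and rope_scaling were separate config attributes.
#   rope_theta = config.rope_theta
#   rope = get_rope(..., base=rope_theta, rope_scaling=config.rope_scaling, ...)

# After: both travel inside config.rope_parameters.
rope_theta = config.rope_parameters["rope_theta"]
rope = get_rope(
    head_size=head_dim,
    rotary_dim=head_dim,
    max_position=config.max_position_embeddings,
    base=rope_theta,  # still passed alongside rope_parameters in the hunks below
    is_neox_style=True,
    rope_parameters=config.rope_parameters,
    dtype=torch.bfloat16,
)
```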
6 changes: 3 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -874,12 +874,12 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

18 changes: 9 additions & 9 deletions benchmarks/kernels/benchmark_mrope.py
@@ -6,9 +6,9 @@
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
# rope_theta, is_neox_style, rope_parameters, dtype, torch_mean, torch_median,
# torch_p99, torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min,
# triton_max, speedup
#
# == Usage Examples ==
#
@@ -88,7 +88,7 @@ def benchmark_mrope(
max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] = None,
rope_parameters: dict[str, Any] = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
@@ -104,7 +104,7 @@ def benchmark_mrope(
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dtype=dtype,
).to(device=device)

@@ -205,7 +205,7 @@ def benchmark_mrope(
max_position,
rope_theta,
is_neox_style,
str(rope_scaling),
str(rope_parameters),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
@@ -257,7 +257,7 @@ def benchmark_mrope(
"max_position",
"rope_theta",
"is_neox_style",
"rope_scaling",
"rope_parameters",
"dtype",
"torch_mean",
"torch_median",
@@ -303,7 +303,7 @@ def benchmark_mrope(
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
rope_theta = config.rope_theta
rope_theta = config.rope_parameters["rope_theta"]
max_position = config.max_position_embeddings

for num_tokens in num_tokens_list:
@@ -317,7 +317,7 @@ def benchmark_mrope(
max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,
4 changes: 2 additions & 2 deletions examples/offline_inference/context_extension.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
of a Qwen model using the YARN method (rope_parameters)
and run a simple chat example.

Usage:
@@ -20,7 +20,7 @@ def create_llm():
# Use yarn to extend context
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
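For reference, a self-contained sketch of the updated override end to end; the model name and numeric values are illustrative assumptions rather than what the example script actually uses.

```python
from vllm import LLM, SamplingParams

# Assumed values for illustration only.
rope_theta = 1_000_000
factor = 4.0
original_max_position_embeddings = 32_768

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # assumed model
    hf_overrides={
        "rope_theta": rope_theta,
        "rope_parameters": {
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
    },
    max_model_len=int(original_max_position_embeddings * factor),
)

outputs = llm.chat(
    [{"role": "user", "content": "Give me a one-line summary of YARN scaling."}],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```

The only change relative to the old script is the key name, `rope_scaling` becoming `rope_parameters`; the inner YARN fields stay the same.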
16 changes: 7 additions & 9 deletions tests/kernels/core/test_mrope.py
@@ -5,11 +5,11 @@
import pytest
import torch
from packaging.version import Version
from transformers import AutoConfig
from transformers import __version__ as TRANSFORMERS_VERSION

from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -98,8 +98,7 @@ def test_mrope(
atol = model_info.atol
rtol = model_info.rtol

config = AutoConfig.from_pretrained(model_name)
config = config.get_text_config()
config = get_config(model_name, False).get_text_config()

# get the model config
total_num_kv_heads = config.num_key_value_heads
@@ -113,7 +112,7 @@
)
is_neox_style = True

rope_theta = config.rope_theta
rope_theta = config.rope_parameters["rope_theta"]
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
@@ -124,7 +123,7 @@
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
dtype=dtype,
).to(device=device)

@@ -173,8 +172,7 @@ def test_mrope_torch_compile_tracing(
atol = model_info.atol
rtol = model_info.rtol

config = AutoConfig.from_pretrained(model_name)
config = config.get_text_config()
config = get_config(model_name, False).get_text_config()

# get the model config
total_num_kv_heads = config.num_key_value_heads
@@ -187,7 +185,7 @@
else config.hidden_size // total_num_heads
)
is_neox_style = True
rope_theta = config.rope_theta
rope_theta = config.rope_parameters["rope_theta"]
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
@@ -198,7 +196,7 @@
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
dtype=dtype,
).to(device=device)

20 changes: 10 additions & 10 deletions tests/kernels/core/test_pos_encoding.py
@@ -121,7 +121,7 @@ def test_rotary_embedding(
def test_rope_module_cache():
MAX_POSITIONS = [123, 1234]
BASES = [10000, 1000000]
ROPE_SCALINGS = (
ROPE_PARAMETERS = (
None,
{"rope_type": "linear", "factor": (1,)},
{"rope_type": "dynamic", "factor": 1},
@@ -132,7 +132,7 @@ def test_rope_module_cache():
MAX_POSITIONS,
BASES,
IS_NEOX_STYLE,
ROPE_SCALINGS,
ROPE_PARAMETERS,
DTYPES,
)
rope_setting_id_map: dict[str, int] = {}
@@ -142,8 +142,8 @@ def test_rope_module_cache():
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
is_neox_style,
rope_parameters,
dtype,
) = setting
if rotary_dim is None:
@@ -153,8 +153,8 @@ def test_rope_module_cache():
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
is_neox_style,
rope_parameters,
dtype,
)
# different settings cannot share the same rope module
@@ -169,8 +169,8 @@ def test_rope_module_cache():
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
is_neox_style,
rope_parameters,
dtype,
) = setting
if rotary_dim is None:
@@ -180,8 +180,8 @@ def test_rope_module_cache():
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
is_neox_style,
rope_parameters,
dtype,
)
# check if cache take effect
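The comments in the hunk above ("different settings cannot share the same rope module", "check if cache take effect") describe the invariant under test. A condensed sketch of that invariant, assuming the keyword names used in the other hunks of this PR:

```python
import torch

from vllm.model_executor.layers.rotary_embedding import get_rope

settings = dict(
    head_size=64,
    rotary_dim=64,
    max_position=1234,
    base=10000,
    is_neox_style=True,
    rope_parameters={"rope_type": "dynamic", "factor": 1},
    dtype=torch.float16,
)

rope_a = get_rope(**settings)
rope_b = get_rope(**settings)
# Identical settings should hit the module cache and reuse the instance.
assert rope_a is rope_b

# Any differing setting (here a different base) must build a new module.
rope_c = get_rope(**{**settings, "base": 1_000_000})
assert rope_c is not rope_a
```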
2 changes: 1 addition & 1 deletion tests/kernels/moe/test_gpt_oss_triton_kernels.py
@@ -201,7 +201,7 @@ class ModelConfig:
sliding_window: int = 128
initial_context_length: int = 4096
rope_theta: float = 150000.0
rope_scaling_factor: float = 32.0
rope_parameters_factor: float = 32.0
rope_ntk_alpha: float = 1.0
rope_ntk_beta: float = 32.0

6 changes: 3 additions & 3 deletions tests/models/language/pooling/test_nomic_max_model_len.py
@@ -80,7 +80,7 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
def test_use_rope_scaling_legal(model_info, vllm_runner):
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
@@ -98,7 +98,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
def test_use_rope_scaling_illegal(model_info, vllm_runner):
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
@@ -116,7 +116,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):

hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
13 changes: 7 additions & 6 deletions tests/test_config.py
@@ -254,39 +254,40 @@ def test_rope_customization():
LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}

llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
assert getattr(llama_model_config.hf_config, "rope_parameters", None) is None
assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
assert llama_model_config.max_model_len == 8192

llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
hf_overrides={
"rope_scaling": TEST_ROPE_SCALING,
"rope_parameters": TEST_ROPE_SCALING,
"rope_theta": TEST_ROPE_THETA,
},
)
assert (
getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING
getattr(llama_model_config.hf_config, "rope_parameters", None)
== TEST_ROPE_SCALING
)
assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384

longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
assert all(
longchat_model_config.hf_config.rope_scaling.get(key) == value
longchat_model_config.hf_config.rope_parameters.get(key) == value
for key, value in LONGCHAT_ROPE_SCALING.items()
)
assert longchat_model_config.max_model_len == 16384

longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
hf_overrides={
"rope_scaling": TEST_ROPE_SCALING,
"rope_parameters": TEST_ROPE_SCALING,
},
)
assert (
getattr(longchat_model_config.hf_config, "rope_scaling", None)
getattr(longchat_model_config.hf_config, "rope_parameters", None)
== TEST_ROPE_SCALING
)
assert longchat_model_config.max_model_len == 4096
49 changes: 28 additions & 21 deletions vllm/config/model.py
@@ -13,6 +13,7 @@
from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from transformers.configuration_utils import ALLOWED_LAYER_TYPES

import vllm.envs as envs
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
@@ -2068,30 +2069,34 @@ def _get_and_verify_max_len(
)
derived_max_model_len = default_max_len

rope_scaling = getattr(hf_config, "rope_scaling", None)
rope_parameters = getattr(hf_config, "rope_parameters", None)
# NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE
# scaling, so we skip applying the scaling factor again.
if rope_scaling is not None and "gemma3" not in hf_config.model_type:
# No need to consider "type" key because of patch_rope_scaling when
# loading HF config
rope_type = rope_scaling["rope_type"]

if rope_type not in ("su", "longrope", "llama3"):
if disable_sliding_window:
# TODO(robertgshaw): Find a model that supports rope_scaling
# with sliding window to see if this case should be allowed.
raise NotImplementedError(
"Disabling sliding window is not supported for models "
"with rope_scaling. Please raise an issue so we can "
"investigate."
)
if rope_parameters is not None and "gemma3" not in hf_config.model_type:
# In Transformers v5 this could be RopeParameters or dict[str, RopeParameters]
# To simplify, we convert any RopeParameters to dict[str, RopeParameters]
if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
rope_parameters = {"": rope_parameters}
for rp in rope_parameters.values():
rope_type = rp["rope_type"]
scaling_factor = 1.0

if rope_type not in ("su", "longrope", "llama3"):
if disable_sliding_window:
# TODO(robertgshaw): Find a model that supports rope_parameters
# with sliding window to see if this case should be allowed.
raise NotImplementedError(
"Disabling sliding window is not supported for models with "
"rope_parameters. Please raise an issue so we can investigate."
)

# NOTE: rope_type == "default" does not define factor
# https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
scaling_factor = rope_scaling.get("factor", 1.0)
# NOTE: rope_type == "default" does not define factor
# https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
scaling_factor = rp.get("factor", 1.0)

if rope_type == "yarn":
derived_max_model_len = rope_scaling["original_max_position_embeddings"]
if rope_type == "yarn":
derived_max_model_len = rp["original_max_position_embeddings"]
# Do this outside loop since all layers should have the same scaling
derived_max_model_len *= scaling_factor

if encoder_config and "max_seq_length" in encoder_config:
@@ -2102,7 +2107,9 @@
if max_model_len is None:
# For LongRoPE, default to original_max_position_embeddings to avoid
# performance degradation for shorter sequences
if rope_scaling is not None and rope_scaling["rope_type"] == "longrope":
if rope_parameters is not None and any(
rp["rope_type"] == "longrope" for rp in rope_parameters.values()
):
max_model_len = int(
getattr(
hf_config, "original_max_position_embeddings", derived_max_model_len
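Pulled out of `_get_and_verify_max_len`, the new handling reduces to roughly the helper below. This is a simplified sketch of the hunk above, not the exact code path, and `ALLOWED_LAYER_TYPES` here is only a stand-in for the constant imported from `transformers.configuration_utils`.

```python
# Stand-in value; the real tuple comes from transformers.configuration_utils.
ALLOWED_LAYER_TYPES = ("full_attention", "sliding_attention")


def derive_scaled_max_len(rope_parameters: dict, derived_max_model_len: float) -> float:
    # v5 configs may hold a single RopeParameters dict or a mapping of
    # layer type -> RopeParameters; normalize to the mapping form.
    if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
        rope_parameters = {"": rope_parameters}

    scaling_factor = 1.0
    for rp in rope_parameters.values():
        if rp["rope_type"] in ("su", "longrope", "llama3"):
            continue
        # "default" rope does not define a factor, so fall back to 1.0.
        scaling_factor = rp.get("factor", 1.0)
        if rp["rope_type"] == "yarn":
            # YARN reports the pre-scaling context length.
            derived_max_model_len = rp["original_max_position_embeddings"]

    # Applied once outside the loop: all layer types share the same scaling.
    return derived_max_model_len * scaling_factor
```

For example, a YARN entry with `factor=4.0` and `original_max_position_embeddings=32768` derives a scaled length of 131072.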