
Commit 7e7ab8b

feat(rope): loosen RopeParameters typing
1 parent: 90d1b67

2 files changed: +33 additions, -21 deletions

src/transformers/modeling_rope_utils.py

Lines changed: 14 additions & 5 deletions
@@ -13,13 +13,20 @@
 # limitations under the License.

 import math
+import sys
 from functools import wraps
 from typing import Optional, TypedDict

 from .configuration_utils import PreTrainedConfig
 from .utils import is_torch_available, logging


+if sys.version_info >= (3, 11):
+    from typing import Required
+else:
+    from typing_extensions import Required
+
+
 logger = logging.get_logger(__name__)


@@ -885,14 +892,16 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set]
     )


-class RopeParameters(TypedDict):
+class RopeParameters(TypedDict, total=False):
     """
     Args:
         rope_theta (`float`):
             The base period of the RoPE embeddings.
-        rope_type (`str`, *optional*, defaults to "default"):
+        rope_type (`str`):
             The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-            'llama3'], with 'default' being the original RoPE implementation.
+            'llama3'], with 'default' being the original RoPE implementation. This value will be "default" if
+            constructed by `standardize_rope_params()` from a legacy config without a `rope_parameters` or
+            `rope_scaling` field that specifies this value.
         factor (`float`, *optional*):
             Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
             most scaling types, a `factor` of x will enable the model to handle sequences of length x *
@@ -924,8 +933,8 @@ class RopeParameters(TypedDict):
             Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
     """

-    rope_theta: float
-    rope_type: Optional[str]
+    rope_theta: Required[float]
+    rope_type: Required[str]
     factor: Optional[float]
     original_max_position_embeddings: Optional[int]
     attention_factor: Optional[float]
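
For context on the annotation above (not part of the commit): with `TypedDict(..., total=False)`, every key becomes optional for static type checkers, and `Required[...]` opts individual keys back in. Below is a minimal sketch of the resulting contract, assuming `RopeParameters` is importable from the top-level `transformers` package as the conversion script in the next file implies; at runtime a `TypedDict` is an ordinary dict and enforces none of this.

# Minimal sketch: only `rope_theta` and `rope_type` must be present; the
# scaling-specific keys may be omitted entirely instead of being set to None.
from transformers import RopeParameters

default_rope = RopeParameters(rope_type="default", rope_theta=10_000.0)
yarn_rope = RopeParameters(rope_type="yarn", rope_theta=10_000.0, factor=4.0)

# A static type checker (e.g. mypy) would flag the call below because the
# Required key `rope_theta` is missing; at runtime it still builds a plain dict.
# broken = RopeParameters(rope_type="default")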

src/transformers/models/gemma3/convert_gemma3_weights.py

Lines changed: 19 additions & 16 deletions
@@ -44,6 +44,7 @@
     Gemma3TextModel,
     GemmaTokenizerFast,
     GenerationConfig,
+    RopeParameters,
     SiglipVisionConfig,
 )
 from transformers.image_utils import PILImageResampling
@@ -142,7 +143,10 @@
             max_position_embeddings=1024,
             query_pre_attn_scalar=256,
             sliding_window=512,
-            rope_parameters=None,
+            rope_parameters={
+                "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0),
+                "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
+            },
             use_bidirectional_attention=True,
         ),
         vision_config=None,
@@ -159,7 +163,10 @@
             max_position_embeddings=32768,
             query_pre_attn_scalar=256,
             sliding_window=512,
-            rope_parameters=None,
+            rope_parameters={
+                "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0),
+                "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
+            },
         ),
         vision_config=None,
     ),
@@ -173,8 +180,10 @@
             num_key_value_heads=1,
             head_dim=256,
             sliding_window=512,
-            rope_theta=1_000_000,  # used for global RoPE only
-            rope_local_base_freq=10_000,
+            rope_parameters={
+                "full_attention": RopeParameters(rope_type="default", rope_theta=1_000_000.0),
+                "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
+            },
             attn_logit_softcapping=None,
             query_pre_attn_scalar=256,
             max_position_embeddings=32_768,
@@ -192,11 +201,9 @@
             num_key_value_heads=4,
             sliding_window=1024,
             rope_parameters={
-                "full_attention": {"rope_type": "linear", "factor": 8.0},
-                "sliding_attention": {"rope_type": "default"},
+                "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0),
+                "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
             },
-            rope_theta=1_000_000,
-            rope_local_base_freq=10_000,
             attn_logit_softcapping=None,
             query_pre_attn_scalar=256,
         ),
@@ -213,11 +220,9 @@
             num_key_value_heads=8,
             sliding_window=1024,
             rope_parameters={
-                "full_attention": {"rope_type": "linear", "factor": 8.0},
-                "sliding_attention": {"rope_type": "default"},
+                "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0),
+                "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
             },
-            rope_theta=1_000_000,
-            rope_local_base_freq=10_000,
             attn_logit_softcapping=None,
             query_pre_attn_scalar=256,
         ),
@@ -234,11 +239,9 @@
             head_dim=128,
             sliding_window=1024,
             rope_parameters={
-                "full_attention": {"rope_type": "linear", "factor": 8.0},
-                "sliding_attention": {"rope_type": "default"},
+                "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0),
+                "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
             },
-            rope_theta=1_000_000,
-            rope_local_base_freq=10_000,
             attn_logit_softcapping=None,
             query_pre_attn_scalar=(42 * 128 // 32),  # 1 / sqrt(hidden_size // num_attention_heads)
         ),
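
As a usage sketch (illustrative only, not part of the commit): the converted Gemma3 text configs now describe RoPE per attention type through a single `rope_parameters` mapping, replacing the separate `rope_theta` / `rope_local_base_freq` kwargs, so a downstream consumer can read both base frequencies from one place.

from transformers import RopeParameters

# Same literals as the hunks above: linear scaling for global attention,
# plain RoPE with a lower base frequency for sliding-window attention.
rope_parameters = {
    "full_attention": RopeParameters(rope_type="linear", rope_theta=1_000_000.0, factor=8.0),
    "sliding_attention": RopeParameters(rope_type="default", rope_theta=10_000.0),
}

# Hypothetical consumer loop, not from the commit: look up settings by layer type.
for layer_type, params in rope_parameters.items():
    print(layer_type, params["rope_type"], params["rope_theta"], params.get("factor"))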
