
Commit e980e99

[model] feat: Add Apertus (#3295)
Pre-release of Apertus from the Swiss AI Initiative.

Main modifications from Llama:
- xIELU activation
- QK-norm

Associated Transformers PR: huggingface/transformers#39381
Associated vLLM PR: vllm-project/vllm#23068
Associated SGLang PR: sgl-project/sglang#9774

GSM8K: results attached as two images in the original commit message (not reproduced here).
1 parent bf127e4 commit e980e99

File tree

6 files changed: +201 -3 lines


tests/models/test_transformer.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 import torch
 from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input
 from transformers import (
+    ApertusConfig,
     AutoModelForCausalLM,
     AutoModelForTokenClassification,
     GemmaConfig,
@@ -33,6 +34,7 @@
     MistralConfig(num_hidden_layers=1),
     GemmaConfig(num_hidden_layers=1),
     Qwen2Config(num_hidden_layers=1),
+    ApertusConfig(num_hidden_layers=1),
 ]
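The new entry follows the same pattern as the existing ones: each config in this list is presumably used to build a small model for the padding/attention checks in this test file. A minimal standalone sketch of that construction step, assuming a transformers release that ships Apertus (>= 4.56 per the Ulysses test below):

from transformers import ApertusConfig, AutoModelForCausalLM

# Single-layer Apertus built straight from the config, mirroring the list entry above.
config = ApertusConfig(num_hidden_layers=1)
model = AutoModelForCausalLM.from_config(config)
print(sum(p.numel() for p in model.parameters()))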

tests/models/test_transformers_ulysses.py

Lines changed: 16 additions & 1 deletion
@@ -18,7 +18,9 @@
 import pytest
 import torch
 import torch.distributed
+import transformers
 from flash_attn.bert_padding import index_first_axis, rearrange, unpad_input
+from packaging import version
 from torch.distributed import init_device_mesh
 from transformers import AutoModelForCausalLM, LlamaConfig, PretrainedConfig, Qwen2Config

@@ -46,7 +48,7 @@ class SequenceParallelConfig:


 def test_configs():
-    return [
+    configs = [
         SequenceParallelConfig(
             LlamaConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32), sp_size=8, is_valid=True
         ),
@@ -68,6 +70,19 @@ def test_configs():
         ),
     ]

+    if version.parse(transformers.__version__) >= version.parse("4.56.0"):
+        from transformers import ApertusConfig
+
+        configs.append(
+            SequenceParallelConfig(
+                ApertusConfig(num_hidden_layers=2, num_attention_heads=32, num_key_value_heads=32, hidden_size=4096),
+                sp_size=8,
+                is_valid=True,
+            )
+        )
+
+    return configs
+

 def sync_model_parameters_global(layer):
     # synchronize weights
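For context on sp_size=8 with 32 attention heads being marked is_valid=True: Ulysses shards attention heads across the sequence-parallel group, so the head count must divide evenly by the SP size. A hedged sketch of that constraint (the real check lives in verl's validate_ulysses_config, whose exact error handling is not shown here):

def heads_divisible_by_sp(num_attention_heads: int, ulysses_sp_size: int) -> bool:
    # 32 heads / sp_size 8 -> 4 heads per rank, so the Apertus config above is valid;
    # e.g. 30 heads with sp_size 8 would not be.
    return num_attention_heads % ulysses_sp_size == 0

assert heads_divisible_by_sp(32, 8)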

tests/utils/test_flops_counter.py

Lines changed: 20 additions & 2 deletions
@@ -18,7 +18,7 @@

 from verl.utils.flops_counter import FlopsCounter

-VALID_CONFIG_TYPE = {"llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text"}
+VALID_CONFIG_TYPE = {"llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus"}


 class Config:
@@ -206,12 +206,30 @@ def __init__(self, config_dict):
         # total: 986195089686528 / 1e12 = 986.195089686528
         "expected_flops_tuple": (283517065887744 / 1e12, 986195089686528 / 1e12),
     },
+    "apertus": {
+        "config": {  # swiss-ai/Apertus-8B
+            "model_type": "apertus",
+            "vocab_size": 131072,
+            "hidden_size": 4096,
+            "intermediate_size": 21504,
+            "num_hidden_layers": 32,
+            "num_attention_heads": 32,
+            "num_key_value_heads": 32,
+            "hidden_act": "xielu",
+            # head_dim will be derived as 4096 / 32 = 128
+        },
+        "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
+        # Calculation for Apertus (hidden_act="xielu" -> MLP uses [k_mlp=2]*H*I params; qk_norm=True -> [k_qkn=2]*H):
+        # V=131072, H=4096, I=21504, L=32, k_mlp=2 (XIELU), k_qkn=2 (QK norm), S=6
+        # S*(2*V*H + L*(4*H**2 + k_mlp*H*I + k_qkn*H)) * (SUM[seqlen]) + 12*SUM[seqlen**2]*L*H
+        "expected_flops_tuple": (199154680725504 / 1e12, 732294071451648 / 1e12),
+    },
 }


 @pytest.mark.parametrize(
     "config_type",
-    ["llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text"],
+    ["llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus"],
 )
 def test_flops_counter(config_type: str):
     test_config = CONFIG[config_type]
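The expected values above follow directly from the formula in the added comment; evaluating it as plain arithmetic reproduces both raw-FLOP counts before the division by 1e12:

# Sanity check of expected_flops_tuple using the comment's formula.
V, H, I, L, S = 131072, 4096, 21504, 32, 6
k_mlp, k_qkn = 2, 2  # XIELU MLP (up/down only) and QK norm

def apertus_total_flops(seqlens):
    dense_params = 2 * V * H + L * (4 * H**2 + k_mlp * H * I + k_qkn * H)
    return S * dense_params * sum(seqlens) + 12 * sum(s * s for s in seqlens) * L * H

assert apertus_total_flops([512, 1024, 2048]) == 199154680725504
assert apertus_total_flops([4096, 4096, 4096]) == 732294071451648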

verl/models/registry.py

Lines changed: 4 additions & 0 deletions
@@ -32,6 +32,10 @@
         "mistral",
         ("ParallelMistralForCausalLMRmPadPP", "ParallelMistralForValueRmPadPP", "ParallelMistralForCausalLMRmPad"),
     ),
+    "ApertusForCausalLM": (
+        "apertus",
+        ("ParallelApertusForCausalLMRmPadPP", "ParallelApertusForValueRmPadPP", "ParallelApertusForCausalLMRmPad"),
+    ),
 }

New file (path not shown in this view)

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
# Copyright 2025 The SwissAI Initiative
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from typing import Callable, Optional

import torch

if sys.version_info >= (3, 11):
    pass
else:
    pass

from transformers.cache_utils import Cache
from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb
from transformers.utils import logging

# Import compatibility wrapper for flash_attn_supports_top_left_mask
from verl.utils.ulysses import (
    gather_heads_scatter_seq,
    gather_seq_scatter_heads,
    get_ulysses_sequence_parallel_world_size,
    validate_ulysses_config,
)

logger = logging.get_logger(__name__)


def apertus_attn_forward(
    self,
    hidden_states: torch.Tensor,
    position_embeddings: tuple[torch.Tensor, torch.Tensor],
    attention_mask: Optional[torch.Tensor],
    past_key_value: Optional[Cache] = None,
    cache_position: Optional[torch.LongTensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
    """
    Adapted from transformers 4.49.0 to support Ulysses sequence parallelism for transformers >= 4.48.0.

    Key differences from Llama attention:
    - QK normalization applied after Q/K projections

    NOTE: This function has been tested only on transformers versions between 4.48.0 and 4.50.0.
    """
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
    from transformers.models.apertus.modeling_apertus import eager_attention_forward

    bsz, q_len, _ = hidden_states.shape

    query_states = self.q_proj(hidden_states).view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

    query_states = self.q_norm(query_states)
    key_states = self.k_norm(key_states)

    ########## AlltoAll for Ulysses ##########
    ulysses_sp_size = get_ulysses_sequence_parallel_world_size()

    if ulysses_sp_size > 1:
        validate_ulysses_config(self.config.num_attention_heads, ulysses_sp_size)

        query_states = gather_seq_scatter_heads(query_states, seq_dim=2, head_dim=1)
        key_states = gather_seq_scatter_heads(key_states, seq_dim=2, head_dim=1)
        value_states = gather_seq_scatter_heads(value_states, seq_dim=2, head_dim=1)

    full_q_len = query_states.size(2)

    cos, sin = position_embeddings
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        # sin and cos are specific to RoPE models; cache_position needed for the static cache
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    attention_interface: Callable = eager_attention_forward
    if self.config._attn_implementation != "eager":
        if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
            logger.warning_once(
                "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                "Falling back to eager attention. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
        else:
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

    attn_output, attn_weights = attention_interface(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        dropout=0.0 if not self.training else self.attention_dropout,
        scaling=self.scaling,
        **kwargs,
    )

    attn_output = attn_output.reshape(bsz, full_q_len, -1, self.head_dim).contiguous()
    ########## AlltoAll for Ulysses ##########
    if ulysses_sp_size > 1:
        attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2)
    attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
    attn_output = self.o_proj(attn_output)
    return attn_output, attn_weights
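This forward is a drop-in replacement for the Hugging Face Apertus attention forward; where and how it gets installed is not part of this view, but the usual pattern for such overrides is a simple method patch. A hedged sketch (the ApertusAttention class name comes from transformers; the module path for apertus_attn_forward is assumed, not confirmed here):

# Illustrative wiring only; the actual registration point is not shown in this commit view.
from transformers.models.apertus.modeling_apertus import ApertusAttention

from verl.models.transformers.apertus import apertus_attn_forward  # assumed module path

ApertusAttention.forward = apertus_attn_forward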

verl/utils/flops_counter.py

Lines changed: 41 additions & 0 deletions
@@ -31,6 +31,7 @@
     "mistral",
     "gemma3_text",
     "seed_oss",
+    "apertus",
 }


@@ -132,6 +133,7 @@ def __init__(self, config: PretrainedConfig):
             "mistral": self._estimate_qwen2_flops,
             "gemma3_text": self._estimate_gemma3_flops,
             "seed_oss": self._estimate_qwen2_flops,
+            "apertus": self._estimate_apertus_flops,
         }
         self.config = config

@@ -329,6 +331,45 @@ def _estimate_gemma3_flops(self, tokens_sum, batch_seqlens, delta_time):
         flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
         return flops_achieved

+    def _estimate_apertus_flops(self, tokens_sum, batch_seqlens, delta_time):
+        hidden_size = self.config.hidden_size
+        vocab_size = self.config.vocab_size
+        num_hidden_layers = self.config.num_hidden_layers
+        num_key_value_heads = self.config.num_key_value_heads
+        num_attention_heads = self.config.num_attention_heads
+        intermediate_size = self.config.intermediate_size
+
+        head_dim = getattr(self.config, "head_dim", self.config.hidden_size // self.config.num_attention_heads)
+        q_size = num_attention_heads * head_dim
+        k_size = num_key_value_heads * head_dim
+        v_size = num_key_value_heads * head_dim
+
+        # Apertus MLP with XIELU activation uses only 2 linear layers (up_proj, down_proj)
+        # No gate_proj for XIELU, unlike SwiGLU which has 3 layers
+        mlp_N = hidden_size * intermediate_size * 2
+        attn_linear_N = hidden_size * (q_size + k_size + v_size + num_attention_heads * head_dim)
+
+        # ApertusConfig has qk_norm defaulting to True.
+        # This adds params for q_norm (on H) and k_norm (on num_kv_heads * head_dim)
+        qk_norm_params_per_layer = hidden_size + num_key_value_heads * head_dim  # q_norm + k_norm
+
+        emd_and_lm_head_N = vocab_size * hidden_size * 2
+        # non-attn all_layer params
+        dense_N = (mlp_N + attn_linear_N + qk_norm_params_per_layer) * num_hidden_layers + emd_and_lm_head_N
+        # non-attn all_layer & all_token fwd & bwd flops
+        dense_N_flops = 6 * dense_N * tokens_sum
+
+        # attn all_layer & all_token fwd & bwd flops
+        seqlen_square_sum = 0
+        for seqlen in batch_seqlens:
+            seqlen_square_sum += seqlen * seqlen
+        attn_qkv_flops = 12 * seqlen_square_sum * head_dim * num_attention_heads * num_hidden_layers
+
+        # all_layer & all_token fwd & bwd flops
+        flops_all_token = dense_N_flops + attn_qkv_flops
+        flops_achieved = flops_all_token * (1.0 / delta_time) / 1e12
+        return flops_achieved
+
     def estimate_flops(self, batch_seqlens, delta_time):
         """
         Estimate the FLOPS based on the number of valid tokens in the current batch and the time taken.
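A hedged usage sketch tying the new estimator to the test values above (ApertusConfig requires transformers >= 4.56; the return shape of estimate_flops is not shown in this diff, so the result is printed rather than unpacked):

from transformers import ApertusConfig

from verl.utils.flops_counter import FlopsCounter

# swiss-ai/Apertus-8B-like shape, matching the test config earlier in this commit.
config = ApertusConfig(
    vocab_size=131072,
    hidden_size=4096,
    intermediate_size=21504,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=32,
)

counter = FlopsCounter(config)
# With delta_time=1.0, the achieved figure should be ~199.15 TFLOPs for this batch,
# matching expected_flops_tuple[0] in the test above.
print(counter.estimate_flops([512, 1024, 2048], delta_time=1.0))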
