
Commit 7d14788

Gemma3n audio model (#18)
* testing utilities for numerics comparisons
* Implement CumulativeGroupNorm and add to SubSampleConvProjection and SSCPConvBlock
* Add audio version of forward script based on RyanMullins' implementation
* Updating to match encoder tests. WIP: config question needs resolving
* Updates to audio classes to enable end-to-end running
* Removing vestigial classes, cleaning up print statements
* Adding SiLU / Swish to audio conformer feed forward block
* Shifted Gemma3p5Audio naming prefix to Gemma3NanoAudio
* Adding outputs to audio test
* Fixes to padding in SSCP and 1D convolution, align RMS Norm with wider model
* Update forward test to load from local weights
* Update conversion to process / output audio layers
* Update __all__ to export audio encoder
* AutoModel registration for Gemma 3n Audio
* Use AutoModel for ConditionalGeneration.audio_tower
* Fixing input_proj_linear transpose
* Fixing Gemma3NanoAudioConformerAttention.post conversion
* Fixing Gemma3NanoAudioSSCPConvBlock.conv weights conversion
* Correcting indentation issue on Gemma3p5RMSNorm

---------

Co-authored-by: Ryan Mullins <[email protected]>
1 parent eda9d33 commit 7d14788
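
One item in the message above worth unpacking is the SiLU / Swish activation added to the conformer feed-forward block. Below is a minimal sketch of that pattern, reusing layer names that appear in the conversion code later in this diff (pre_layer_norm, ffw_layer_1, ffw_layer_2); the sizes, the half-step residual, and the use of nn.LayerNorm as a stand-in for the model's RMSNorm are illustrative assumptions, not the modeling code from this commit.

import torch
import torch.nn as nn

class ConformerFeedForward(nn.Module):
    def __init__(self, hidden_size: int = 1536, expansion: int = 4):
        super().__init__()
        self.pre_layer_norm = nn.LayerNorm(hidden_size)  # stand-in; the commit aligns the real norm with the model's RMSNorm
        self.ffw_layer_1 = nn.Linear(hidden_size, hidden_size * expansion, bias=False)
        self.act = nn.SiLU()  # SiLU == Swish, the activation named in the commit message
        self.ffw_layer_2 = nn.Linear(hidden_size * expansion, hidden_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Conformer-style half-step residual around the feed-forward block (assumed here).
        return x + 0.5 * self.ffw_layer_2(self.act(self.ffw_layer_1(self.pre_layer_norm(x))))

# e.g. ConformerFeedForward()(torch.randn(1, 80, 1536)).shape -> torch.Size([1, 80, 1536])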

8 files changed (+1887, -652 lines)

gemma3n_audio_forward_test.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+import numpy as np
+import torch
+import transformers
+from transformers import (
+    GemmaTokenizer,
+    Gemma3p5ForCausalLM,
+    model_addition_debugger_context,
+    Gemma3NanoAudioConfig,
+    Gemma3NanoAudioEncoder,
+)
+
+from transformers.models.gemma3p5.modeling_gemma3p5 import Gemma3NanoAudioEncoder
+
+model_id = "gg-hf-gm/gemma-3p5-audio-encoder"
+# model = Gemma3NanoAudioEncoder.from_pretrained(model_id, input_feat_size=128)
+model = Gemma3NanoAudioEncoder.from_pretrained("/usr/local/google/home/philculliton/gemma3p5/checkpoints/4b_it_safetensors/")
+audio_config = model.config
+
+print(audio_config)
+
+batch_size = 1
+seq_len = 80  # Example input sequence length (make it odd to test padding)
+pad_len = 40
+
+print("audio_config.input_feat_size", audio_config.input_feat_size)
+
+rng = np.random.default_rng(seed=42)
+audio_mel = rng.normal(size=(batch_size, seq_len, audio_config.input_feat_size)).astype(np.float32)
+print("audio_mel", audio_mel.shape)
+audio_mel_mask_np = np.zeros((batch_size, seq_len), dtype=bool)
+if seq_len >= pad_len:  # Ensure pad_len is not out of bounds
+    audio_mel_mask_np[:, -pad_len:] = True  # Pad the end
+
+with model_addition_debugger_context(
+    model=model,
+    debug_path="/usr/local/google/home/philculliton/nano3/gemma3n_audio_encoder_debug",
+    do_prune_layers=False,
+    use_repr=False,
+):
+    print(audio_mel, audio_mel_mask_np)
+
+    outputs = model.forward(torch.from_numpy(audio_mel), torch.from_numpy(audio_mel_mask_np))
+
+print(outputs)
+print("Sum: ", np.sum(outputs[0].numpy()))

gemma3n_forward_test.py

Lines changed: 0 additions & 2 deletions
@@ -8,8 +8,6 @@
     model_addition_debugger_context
 )
 
-from transformers.models.gemma3p5.modeling_gemma3p5 import Gemma3p5AudioEncoder
-
 model_id = "/usr/local/google/home/ryanmullins/nano3/checkpoints/g251_safetensors"
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)

src/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
@@ -138,6 +138,7 @@
         ("gemma3", "Gemma3Config"),
         ("gemma3p5", "Gemma3p5Config"),
         ("gemma3_text", "Gemma3TextConfig"),
+        ("gemma3p5_audio", "Gemma3NanoAudioConfig"),
         ("gemma3p5_text", "Gemma3p5TextConfig"),
         ("gemma3p5_vision", "Gemma3p5VisionConfig"),
         ("git", "GitConfig"),
@@ -511,6 +512,7 @@
         ("gemma3", "Gemma3ForConditionalGeneration"),
         ("gemma3p5", "Gemma3p5ForConditionalGeneration"),
         ("gemma3_text", "Gemma3ForCausalLM"),
+        ("gemma3p5_audio", "Gemma3NanoAudioEncoder"),
         ("gemma3p5_text", "Gemma3p5ForCausalLM"),
         ("gemma3p5_vision", "TimmWrapperModel"),
         ("git", "GIT"),

src/transformers/models/auto/modeling_auto.py

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@
         ("gemma2", "Gemma2Model"),
         ("gemma3", "Gemma3Model"),
         ("gemma3_text", "Gemma3TextModel"),
+        ("gemma3p5_audio", "Gemma3NanoAudioEncoder"),
         ("gemma3p5_text", "Gemma3p5TextModel"),
         ("gemma3p5_vision", "TimmWrapperModel"),
         ("git", "GitModel"),

src/transformers/models/gemma3p5/configuration_gemma3p5.py

Lines changed: 9 additions & 9 deletions
@@ -271,12 +271,12 @@ def __init__(
         self.activation_sparsity_pattern = activation_sparsity_pattern
 
 
-class Gemma3p5AudioConfig(PretrainedConfig):
+class Gemma3NanoAudioConfig(PretrainedConfig):
     model_type = "gemma3p5_audio"
 
     def __init__(
         self,
-        input_feat_size: int = 80,
+        input_feat_size: int = 128,
         hidden_size: int = 1536,
         embedding_norm_eps: float = 1e-6,
         vocab_size: int = 128,
@@ -458,7 +458,7 @@ class Gemma3p5Config(PretrainedConfig):
     >>> vision_config = AutoConfig.from_pretrained(checkpoint)
 
     >>> # Initializing a Gemma3p5 Audio config
-    >>> audio_config = Gemma3p5AudioConfig()
+    >>> audio_config = Gemma3NanoAudioConfig()
 
     >>> # Initializing a Gemma3p5 Text config
     >>> text_config = Gemma3p5TextConfig()
@@ -477,14 +477,14 @@ class Gemma3p5Config(PretrainedConfig):
     sub_configs = {
         "text_config": Gemma3p5TextConfig,
         "vision_config": Gemma3p5VisionConfig,
-        "audio_config": Gemma3p5AudioConfig,
+        "audio_config": Gemma3NanoAudioConfig,
     }
 
     def __init__(
         self,
         text_config: Optional[Union[Gemma3p5TextConfig, dict[str, Any]]] = None,
         vision_config: Optional[Union[Gemma3p5VisionConfig, dict[str, Any]]] = None,
-        audio_config: Optional[Union[Gemma3p5AudioConfig, dict[str, Any]]] = None,
+        audio_config: Optional[Union[Gemma3NanoAudioConfig, dict[str, Any]]] = None,
         audio_soft_tokens_per_image: int = 256,
         vision_soft_tokens_per_image: int = 256,
         boi_token_id: int = 255_999,
@@ -511,10 +511,10 @@ def __init__(
             logger.info("vision_config is None. Using default Gemma3p5VisionConfig.")
 
         if isinstance(audio_config, dict):
-            audio_config = Gemma3p5AudioConfig(**audio_config)
+            audio_config = Gemma3NanoAudioConfig(**audio_config)
         elif audio_config is None:
-            audio_config = Gemma3p5AudioConfig()
-            logger.info("audio_config is None. Using default Gemma3p5AudioConfig.")
+            audio_config = Gemma3NanoAudioConfig()
+            logger.info("audio_config is None. Using default Gemma3NanoAudioConfig.")
 
         self.text_config = text_config
         self.vision_config = vision_config
@@ -531,4 +531,4 @@ def __init__(
         self.initializer_range = initializer_range
 
 
-__all__ = ["Gemma3p5Config", "Gemma3p5AudioConfig", "Gemma3p5TextConfig", "Gemma3p5VisionConfig"]
+__all__ = ["Gemma3p5Config", "Gemma3NanoAudioConfig", "Gemma3p5TextConfig", "Gemma3p5VisionConfig"]

src/transformers/models/gemma3p5/convert_gemma3p5_weights.py

Lines changed: 20 additions & 23 deletions
@@ -39,12 +39,12 @@
 from transformers import (
     AutoConfig,
     Gemma3p5Config,
-    # Gemma3p5AudioEncoder,
     Gemma3p5ForCausalLM,
     Gemma3p5ForConditionalGeneration,
     Gemma3ImageProcessor,
     Gemma3Processor,
-    Gemma3p5AudioConfig,
+    Gemma3NanoAudioConfig,
+    Gemma3NanoAudioEncoder,
     Gemma3p5TextConfig,
     Gemma3p5VisionConfig,
     GemmaTokenizerFast,
@@ -155,12 +155,12 @@
             activation_sparsity_pattern=(0.95,)*10 + (0.0,)*20,
         ),
         vision_config=Gemma3p5VisionConfig(),
-        audio_config=Gemma3p5AudioConfig(),
+        audio_config=Gemma3NanoAudioConfig(),
     ),
     _VARIANT_GEMMA_3_4B: Gemma3p5Config(
         text_config=Gemma3p5TextConfig(),
         vision_config=Gemma3p5VisionConfig(),
-        audio_config=Gemma3p5AudioConfig(),
+        audio_config=Gemma3NanoAudioConfig(),
     ),
 }
@@ -228,7 +228,7 @@
 
 
 def convert_audio_encoder_weights(
-    config: Gemma3p5AudioConfig,
+    config: Gemma3NanoAudioConfig,
     path: str,
     param: str,
     weights: np.ndarray,
@@ -242,7 +242,7 @@ def convert_audio_encoder_weights(
 
         for i, matrix in enumerate(weights):
             if "fflayer_end" in path:
-                base = f"audio_tower.conformer.{i}.ffw_layer_end"
+                base = f"conformer.{i}.ffw_layer_end"
 
                 if path.endswith("ffn_layer1"):
                     converted_paths.append(f"{base}.ffw_layer_1.weight")
@@ -257,7 +257,7 @@ def convert_audio_encoder_weights(
                 converted_paths.append(f"{base}.pre_layer_norm.weight")
                 converted_weights.append(matrix)
             elif "fflayer_start" in path:
-                base = f"audio_tower.conformer.{i}.ffw_layer_start"
+                base = f"conformer.{i}.ffw_layer_start"
 
                 if path.endswith("ffn_layer1"):
                     converted_paths.append(f"{base}.ffw_layer_1.weight")
@@ -272,10 +272,10 @@ def convert_audio_encoder_weights(
                 converted_paths.append(f"{base}.pre_layer_norm.weight")
                 converted_weights.append(matrix)
             elif path.endswith("final_ln"):
-                converted_paths.append(f"audio_tower.conformer.{i}.norm.weight")
+                converted_paths.append(f"conformer.{i}.norm.weight")
                 converted_weights.append(matrix)
             elif "lconv" in path:
-                base = f"audio_tower.conformer.{i}.lconv1d"
+                base = f"conformer.{i}.lconv1d"
 
                 if path.endswith("conv_norm"):
                     converted_paths.append(f"{base}.conv_norm.weight")
@@ -293,7 +293,7 @@ def convert_audio_encoder_weights(
                 converted_paths.append(f"{base}.pre_layer_norm.weight")
                 converted_weights.append(matrix)
             elif "trans_atten" in path:
-                base = f"audio_tower.conformer.{i}.attention"
+                base = f"conformer.{i}.attention"
 
                 if param == "per_dim_scale":
                     converted_paths.append(f"{base}.attn.per_dim_scale")
@@ -312,7 +312,7 @@ def convert_audio_encoder_weights(
                     converted_weights.append(matrix.reshape(config.hidden_size, config.hidden_size).transpose())
                 elif path.endswith("post"):
                     converted_paths.append(f"{base}.post.weight")
-                    converted_weights.append(matrix.transpose(1, 2, 0).reshape(config.hidden_size, config.hidden_size))
+                    converted_weights.append(matrix.transpose(2, 0, 1).reshape(config.hidden_size, config.hidden_size))
                 elif path.endswith("post_norm"):
                     converted_paths.append(f"{base}.post_norm.weight")
                     converted_weights.append(matrix)
@@ -321,21 +321,18 @@ def convert_audio_encoder_weights(
                 converted_weights.append(matrix)
     elif path.startswith(_AUDIO_ENCODER_SSCP):
         if path.endswith("input_proj"):
-            converted_paths.append(f"audio_tower.subsample_conv_projection.input_proj_linear.weight")
+            converted_paths.append(f"subsample_conv_projection.input_proj_linear.weight")
             converted_weights.append(
-                weights.transpose(1, 2, 0).reshape(config.hidden_size, config.sscp_conv_channel_size[1] ** 2)
+                weights.transpose(2, 0, 1).reshape(config.hidden_size, config.sscp_conv_channel_size[1] ** 2)
             )
         elif "norm_" in path:
            index = int(path[-1])
-            converted_paths.extend([
-                f"audio_tower.subsample_conv_projection.conv_{index}.norm.bias",
-                f"audio_tower.subsample_conv_projection.conv_{index}.norm.weight",
-            ])
-            converted_weights.extend([np.zeros_like(weights), weights])
+            converted_paths.append(f"subsample_conv_projection.conv_{index}.norm.weight")
+            converted_weights.append(weights)
         elif "subsampling_" in path:
             index = int(path[-1])
-            converted_paths.append(f"audio_tower.subsample_conv_projection.conv_{index}.conv.weight")
-            converted_weights.append(weights.transpose())
+            converted_paths.append(f"subsample_conv_projection.conv_{index}.conv.weight")
+            converted_weights.append(weights.transpose(3, 2, 0, 1))
 
     if (cpl := len(converted_paths)) != (cwl := len(converted_weights)):
         raise ValueError(
@@ -649,7 +646,7 @@ def update_tree(path: str, weights: np.ndarray, target_dtype: torch.dtype) -> None:
             update_tree(
                 "embed_vision.embedding_projection.weight", value.transpose(), config.vision_config.torch_dtype
             )
-        elif path.startswith(_TRANSFORMER_PARAMETER):
+        if path.startswith(_TRANSFORMER_PARAMETER):
            for path, weights in convert_transformer_weights(config.text_config, path, param, value):
                update_tree(f"language_model.{path}", weights, config.text_config.torch_dtype)
        elif _MOBILE_NET_PREFIX in path:
@@ -659,7 +656,7 @@ def update_tree(path: str, weights: np.ndarray, target_dtype: torch.dtype) -> None:
            update_tree(f"vision_tower.timm_model.{path}", weights, config.vision_config.torch_dtype)
        elif path.startswith(_AUDIO_ENCODER_PARAMETER):
            for path, weights in convert_audio_encoder_weights(config.audio_config, path, param, value):
-                update_tree(path, weights, config.audio_config.torch_dtype)
+                update_tree(f"audio_tower.{path}", weights, config.audio_config.torch_dtype)
 
     hf_tree["language_model.lm_head.weight"] = hf_tree["language_model.model.embed_tokens.weight"]
 
@@ -700,7 +697,7 @@ def main(*args):
         variant,
         type(model).__name__,
     )
-    model.save_pretrained(output_path, safe_serialization=True)
+    model.save_pretrained(output_path, state_dict=state_tree, safe_serialization=True)
     logging.info(
         "Saved Gemma 3 (%s) to SafeTensors in %s using %s",
         variant,
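
The transpose fixes in this file change how multi-axis weights are flattened before loading into Linear modules; both orderings yield a tensor of the correct final shape, so only the element layout distinguishes them, and the wrong one scrambles the projection silently instead of raising a shape error. A small self-contained demonstration (the (num_heads, head_dim, hidden_size) shape for the attention `post` matrix is an assumption for illustration):

import numpy as np

num_heads, head_dim = 4, 8
hidden = num_heads * head_dim  # 32

# Stand-in for the attention `post` projection weights.
w = np.arange(num_heads * head_dim * hidden, dtype=np.float32).reshape(num_heads, head_dim, hidden)

old = w.transpose(1, 2, 0).reshape(hidden, hidden)  # ordering before this commit
new = w.transpose(2, 0, 1).reshape(hidden, hidden)  # ordering after this commit

print(old.shape == new.shape)    # True: identical shapes
print(np.array_equal(old, new))  # False: different element layouts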
