70 changes: 69 additions & 1 deletion convert_hf_to_gguf.py
@@ -6003,7 +6003,75 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return []  # skip other tensors


@ModelBase.register("Gemma3nForConditionalGeneration")
@ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel")
class Gemma3nVisionModel(MmprojModel):
    """Vision encoder converter for Gemma3n using the MobileNetV5 architecture"""

    # MobileNetV5 has no transformer layers, so there is no block count.
    # An empty n_block_keys skips the find_hparam check.
    n_block_keys = []

    def find_hparam(self, keys: list[str], optional: bool = False) -> Any:
        """Override to return 0 for the block count, since MobileNetV5 is CNN-based"""
        if not keys:  # n_block_keys is empty (our case)
            return 0
        # Otherwise use the parent implementation
        return super().find_hparam(keys, optional)

    def __init__(self, *args, **kwargs):
        # Parent init calls find_hparam, which now returns 0 for empty keys
        super().__init__(*args, **kwargs)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams

        # Set projector type to GEMMA3N
        self.gguf_writer.add_clip_projector_type(gguf.VISION_PROJECTOR_TYPE.GEMMA3N)

        # MobileNetV5-specific parameters
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)  # MobileNetV5 uses approximate GELU

        # Image sequence length (256 tokens = 16x16 grid for Gemma3n);
        # currently unused, but additional metadata can be written as needed
        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Force quantization settings for specific tensor types
        if "input_projection" in name or "input_proj" in name:
            return gguf.GGMLQuantizationType.F16
        if ".embeddings." in name or "stem" in name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Skip non-vision tensors
        if not (name.startswith("multi_modal_projector.") or
                name.startswith("vision_tower.") or
                name.startswith("multimodal_projector.") or
                name.startswith("vision_model.")):
            return []

        # Process MobileNetV5 and projection tensors
        name = name.replace("_weight", ".weight")

        # Gemma3n uses Gemma3p5RMSNorm (scale_shift=0), so no +1 correction is
        # needed here, unlike Gemma3's Gemma3RMSNorm (scale_shift=1). If a model
        # variant uses the Gemma3-style norm, re-enable the correction below:
        if "soft_emb_norm.weight" in name:
            # logger.info(f"Correcting norm value for '{name}'")
            # data_torch = data_torch + 1
            pass

        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Gemma3nForCausalLM")
class Gemma3NModel(Gemma3Model):
    model_arch = gguf.MODEL_ARCH.GEMMA3N
    norm_shift = 0.0  # matches the Gemma3p5RMSNorm scale_shift in the Python reference code
112 changes: 112 additions & 0 deletions docs/multimodal/gemma3n.md
@@ -0,0 +1,112 @@
# Gemma 3n Vision

> [!IMPORTANT]
>
> This is highly experimental and intended for demonstration purposes only.

## Overview

Gemma 3n is an advanced multimodal model that uses the **MobileNetV5** vision encoder architecture (instead of the SigLIP encoder used in Gemma 3). The MobileNetV5 encoder provides efficient CNN-based feature extraction, with a Multi-Scale Fusion Adapter (MSFA) for combining features at different resolutions.

## Architecture Differences

- **Gemma 3**: Uses SigLIP vision encoder (Vision Transformer)
- **Gemma 3n**: Uses MobileNetV5 vision encoder (CNN with MSFA)

Both models share the same projection mechanism to the language model embedding space.

## Quick Start

You can use pre-quantized models from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account (when available):

```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli

# alternatively, install via brew (macOS)
brew install llama.cpp

# run it (example - update with actual model names when available)
llama-mtmd-cli -hf ggml-org/gemma-3n-VARIANT-GGUF
```

## How to get mmproj.gguf?

Simply add `--mmproj` when converting the model via `convert_hf_to_gguf.py`:

```bash
cd gemma-3n-model-directory
python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
# output file: mmproj-model.gguf
```

## How to run it?

What you need:
- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
- The mmproj file from the step above (contains the MobileNetV5 vision encoder)
- An image file

```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli

# run it
./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj-model.gguf --image your_image.jpg
```

## Model Conversion Details

The conversion process handles:
1. **Text Model**: Standard Gemma 3n language model weights
2. **Vision Encoder**: MobileNetV5 architecture with:
- Stem convolution layer
- Multiple inverted residual blocks
- Multi-Query Attention (MQA) blocks
- Multi-Scale Fusion Adapter (MSFA)
3. **Projection Layers**: RMSNorm and linear projection to language model space
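
As a rough illustration of step 2's tensor routing, the prefix check that separates vision tensors from text-model tensors during conversion can be sketched as follows (the prefixes mirror those in `modify_tensors`; the helper name is hypothetical):

```python
# Checkpoint tensor names outside these prefixes belong to the text model
# and are skipped by the vision converter.
VISION_PREFIXES = (
    "multi_modal_projector.",
    "vision_tower.",
    "multimodal_projector.",
    "vision_model.",
)

def is_vision_tensor(name: str) -> bool:
    """Return True if a checkpoint tensor belongs to the vision stack."""
    return name.startswith(VISION_PREFIXES)

print(is_vision_tensor("vision_tower.timm_model.blocks.0.conv.weight"))  # True
print(is_vision_tensor("model.layers.0.self_attn.q_proj.weight"))        # False
```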

### Image Processing

- **Input Resolution**: Depends on model configuration (typically 384x384 or similar)
- **Output Tokens**: 256 soft tokens (16×16 grid)
- **Preprocessing**: Normalization based on model metadata
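
A minimal sketch of the normalization step, with illustrative defaults only — the actual resolution, mean, and std must be read from the model's preprocessor config metadata:

```python
import numpy as np

def preprocess(image: np.ndarray, size: int = 384,
               mean: float = 0.5, std: float = 0.5) -> np.ndarray:
    """Toy preprocessing: scale pixel values to [0, 1], then normalize.

    The 384x384 resolution and the mean/std of 0.5 are placeholders; the
    real values come from the model's preprocessor config.
    """
    assert image.shape[:2] == (size, size), "resize the image first"
    x = image.astype(np.float32) / 255.0
    return (x - mean) / std

img = np.full((384, 384, 3), 255, dtype=np.uint8)  # an all-white test image
out = preprocess(img)
print(out.max())  # white pixels map to (1.0 - 0.5) / 0.5 = 1.0
```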

## Technical Implementation

### MobileNetV5 Components

1. **Inverted Residual Blocks**: Expansion → Depthwise Conv → Squeeze-Excitation → Projection
2. **RMSNorm2d**: 2D RMS normalization for feature maps
3. **Approximate GELU**: Activation function throughout the network
4. **MSFA**: Combines features from multiple scales for robust representation
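
As a sketch of component 2, a 2D RMS norm over an NCHW feature map might look like the following — a simplified NumPy version for illustration, not the actual llama.cpp implementation:

```python
import numpy as np

def rms_norm_2d(x: np.ndarray, weight: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """RMS-normalize an NCHW feature map over the channel axis.

    Each spatial position is divided by the RMS of its channel vector,
    then scaled by a learned per-channel weight.
    """
    rms = np.sqrt(np.mean(x * x, axis=1, keepdims=True) + eps)
    return x / rms * weight.reshape(1, -1, 1, 1)

x = np.random.randn(1, 8, 4, 4).astype(np.float32)
y = rms_norm_2d(x, np.ones(8, dtype=np.float32))
# After normalization, the channel-wise RMS at every position is ~1
print(np.allclose(np.sqrt(np.mean(y * y, axis=1)), 1.0, atol=1e-3))  # True
```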

### Integration with Language Model

The vision encoder outputs are processed through:
1. RMSNorm normalization
2. Soft embedding normalization
3. Linear projection to language model embedding dimension
4. Non-causal attention during vision processing
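
Steps 1–3 above can be sketched roughly as follows; the 2048/3072 dimensions are made-up placeholders, and the real pipeline also involves the attention-mask handling of step 4, which is not shown:

```python
import numpy as np

def project_to_lm(vision_feats: np.ndarray,
                  norm_w: np.ndarray,
                  proj_w: np.ndarray,
                  eps: float = 1e-6) -> np.ndarray:
    """Sketch: RMS-normalize each soft token, then linearly project it
    into the language model's embedding dimension."""
    rms = np.sqrt(np.mean(vision_feats ** 2, axis=-1, keepdims=True) + eps)
    normed = vision_feats / rms * norm_w  # soft-embedding RMSNorm
    return normed @ proj_w                # (n_tokens, d_model)

tokens = np.random.randn(256, 2048).astype(np.float32)  # 256 soft tokens
norm_w = np.ones(2048, dtype=np.float32)
proj_w = np.random.randn(2048, 3072).astype(np.float32) * 0.02
emb = project_to_lm(tokens, norm_w, proj_w)
print(emb.shape)  # (256, 3072)
```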

## Notes

- Gemma 3n uses `Gemma3p5RMSNorm`, whose normalization behavior differs from Gemma 3's `Gemma3RMSNorm`
- The MobileNetV5 architecture is more efficient than Vision Transformers for certain use cases
- Image tokens are processed with non-causal attention masks

## Troubleshooting

If you encounter issues:

1. **Model Loading Errors**: Ensure you're using the correct `--mmproj` file that matches the text model version
2. **Vision Encoder Not Found**: Make sure the mmproj file was generated with the `--mmproj` flag
3. **Image Size Mismatches**: Check the model's expected input resolution in the preprocessor config

## References

- [MobileNetV5 (timm implementation)](https://github.com/huggingface/pytorch-image-models)
- [Gemma 3n Model Card](https://huggingface.co/transformers/models/gemma3n)
- [llama.cpp Multimodal Documentation](../)
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
@@ -456,6 +456,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
    GLM_EDGE = auto()
    MERGER = auto()
    GEMMA3 = auto()
    GEMMA3N = auto()
    QWEN3VL = auto()
    COGVLM = auto()
1 change: 1 addition & 0 deletions tools/mtmd/README.md
@@ -43,6 +43,7 @@ Multimodal projector (`mmproj`) files are specific to each model architecture.

For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag to get the `mmproj` file:
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ; See the guide [here](../../docs/multimodal/gemma3.md) - Note: 1B variant does not have vision support
- Gemma 3n ; See the guide [here](../../docs/multimodal/gemma3n.md) - Uses MobileNetV5 vision encoder
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint
2 changes: 2 additions & 0 deletions tools/mtmd/clip-impl.h
@@ -142,6 +142,7 @@ enum projector_type {
    PROJECTOR_TYPE_QWEN2VL,
    PROJECTOR_TYPE_QWEN3VL,
    PROJECTOR_TYPE_GEMMA3,
    PROJECTOR_TYPE_GEMMA3N,
    PROJECTOR_TYPE_IDEFICS3,
    PROJECTOR_TYPE_PIXTRAL,
    PROJECTOR_TYPE_QWEN25VL,
@@ -169,6 +170,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
    { PROJECTOR_TYPE_QWEN3VL,  "qwen3vl_merger"},
    { PROJECTOR_TYPE_GEMMA3,   "gemma3"},
    { PROJECTOR_TYPE_GEMMA3N,  "gemma3n"},
    { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
    { PROJECTOR_TYPE_PIXTRAL,  "pixtral"},
    { PROJECTOR_TYPE_ULTRAVOX, "ultravox"},