Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/supported_models/multimodal_language_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@ in the GitHub search bar.
| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or image input. It also supports tool calling and structured output. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. Currently, it supports only text and vision modalities in SGLang. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |
| **MiMo-VL** (7B) | `XiaomiMiMo/MiMo-VL-7B-RL` | `mimo-vl` | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
1 change: 1 addition & 0 deletions python/sglang/srt/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,7 @@ def generate_chat_conv(
sep="<|end|>",
stop_str="<|end|>",
image_token="<|endoftext10|>",
audio_token="<|endoftext11|>",
)
)

Expand Down
4 changes: 4 additions & 0 deletions python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ class MultimodalDataItem:
# For gemma3n
input_features_mask: Optional[torch.Tensor] = None

# For phi4-mm
image_attention_mask: Optional[torch.Tensor] = None
audio_attention_mask: Optional[torch.Tensor] = None

@staticmethod
def is_empty_list(l):
if l is None:
Expand Down
41 changes: 39 additions & 2 deletions python/sglang/srt/models/phi4mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.idefics2 import Idefics2VisionTransformer
from sglang.srt.models.llama import LlamaForCausalLM
from sglang.srt.models.phi4mm_audio import AudioEmbedding

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -420,16 +421,49 @@ def __init__(
model_dir=config._name_or_path,
)

if isinstance(config.embd_layer["audio_embd_layer"], dict):
embedding_config = {
"embedding_cls": config.embd_layer["audio_embd_layer"]["embedding_cls"],
**config.embd_layer["audio_embd_layer"],
}
else:
embedding_config = {"embedding_cls": config.embd_layer["embedding_cls"]}

self.embed_tokens_extend = AudioEmbedding(config, **embedding_config)

def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
    """Encode the image inputs of ``items`` with the vision encoder.

    The per-item pixel values, attention masks and image sizes are each
    concatenated along dim 0 and passed to ``self.vision_encoder`` in a
    single call.

    Args:
        items: Multimodal items that each provide ``feature``,
            ``image_attention_mask`` and ``image_sizes`` tensors.

    Returns:
        The vision encoder outputs concatenated along dim 0 and cast to the
        dtype of the vision encoder's parameters.
    """
    # Use the dtype of the encoder weights so inputs and outputs match it.
    dtype = next(self.vision_encoder.parameters()).dtype
    pixel_values = torch.cat([item.feature for item in items], dim=0).type(dtype)
    # The mask is read only from ``image_attention_mask``; the former
    # ``image_emb_mask`` lookup was a dead store that was overwritten
    # immediately and is no longer performed.
    image_attention_mask = torch.cat(
        [item.image_attention_mask for item in items], dim=0
    )
    image_sizes = torch.cat([item.image_sizes for item in items], dim=0)
    # The encoder's result is treated as a sequence of tensors to concatenate.
    image_embeds = self.vision_encoder(
        pixel_values, image_sizes, image_attention_mask
    )
    return torch.cat(image_embeds).type(dtype)

def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
    """Embed the audio inputs of ``items`` and concatenate the results.

    Each item is passed through ``self.embed_tokens_extend`` separately, after
    its tensors are moved to the device (and its features cast to the dtype)
    of that module's first parameter.

    Args:
        items: Multimodal items that each provide ``feature`` and an optional
            ``audio_attention_mask``.

    Returns:
        The per-item audio embeddings concatenated along dim 0, cast to the
        dtype of the audio embedding module's parameters.
    """
    reference_param = next(self.embed_tokens_extend.parameters())
    target_device = reference_param.device
    target_dtype = reference_param.dtype

    per_item_embeds = []
    for item in items:
        # item.feature: (num_audios_in_a_sequence, T, D)
        features = item.feature.to(target_device).type(target_dtype)
        # item.audio_attention_mask: (num_audios_in_a_sequence, T, D) BoolTensor or None
        mask = item.audio_attention_mask
        if mask is not None:
            mask = mask.to(target_device)
        per_item_embeds.append(
            self.embed_tokens_extend(
                audio_features=features,
                audio_attention_mask=mask,
            )
        )
    return torch.cat(per_item_embeds).type(target_dtype)

def forward(
self,
input_ids: torch.Tensor,
Expand All @@ -443,6 +477,7 @@ def forward(
language_model=self.language_model,
data_embedding_funcs={
Modality.IMAGE: self.get_image_feature,
Modality.AUDIO: self.get_audio_feature,
},
positions=positions,
)
Expand All @@ -464,6 +499,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
(".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
]
prefix_mapping = {
"model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
"model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
"model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
"model.embed_tokens_extend.image_embed.": "vision_encoder.",
"model.": "language_model.model.",
}
Expand All @@ -472,7 +510,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
"img_processor.encoder.layers.26",
"img_processor.head",
"img_processor.post_layernorm",
"audio",
]

def _should_skip(name: str) -> bool:
Expand Down
Loading