From dfebf518d6aeb8981585257234f055c35f7891f5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 8 Apr 2025 03:18:24 +0000 Subject: [PATCH 01/45] [Bugfix] Merge multimodal embeddings by `is_embed` mask instead of token ID Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 20 +++++++++++++++----- vllm/v1/worker/tpu_model_runner.py | 19 +++++++++++++++---- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a83409a72a88..33c7e5951d20 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -1027,12 +1028,21 @@ def execute_model( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - input_ids = self.input_ids[:num_scheduled_tokens] + + # TODO: Apply vllm-project/vllm#16007 + # so that we don't have to define `get_input_embeddings` in the + # multi-modal model once V0 is deprecated + inputs_embeds = self.model.get_input_embeddings(self.input_ids) + if mm_embeds: - inputs_embeds = self.model.get_input_embeddings( - input_ids, mm_embeds) - else: - inputs_embeds = self.model.get_input_embeddings(input_ids) + is_embed: torch.Tensor = ... # TODO + + _merge_multimodal_embeddings( + inputs_embeds, + is_embed, + mm_embeds, + ) + # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) inputs_embeds = self.inputs_embeds[:num_input_tokens] diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 4b058122f0ce..ffedfeffa549 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -19,6 +19,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -667,11 +668,21 @@ def execute_model( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. + + # TODO: Apply vllm-project/vllm#16007 + # so that we don't have to define `get_input_embeddings` in the + # multi-modal model once V0 is deprecated + inputs_embeds = self.model.get_input_embeddings(self.input_ids) + if mm_embeds: - inputs_embeds = self.model.get_input_embeddings( - self.input_ids, mm_embeds) - else: - inputs_embeds = self.model.get_input_embeddings(self.input_ids) + is_embed: torch.Tensor = ... # TODO + + _merge_multimodal_embeddings( + inputs_embeds, + is_embed, + mm_embeds, + ) + input_ids = None else: # For text-only models, we use token ids as input. 
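The commit above switches from matching placeholder token IDs to a boolean `is_embed` mask when splicing multimodal embeddings into the text embeddings. As a rough illustration of the idea only (this sketch is not part of the patch; `merge_by_mask` and its arguments are hypothetical stand-ins for `_merge_multimodal_embeddings`), the merge amounts to a masked row assignment:

import torch

def merge_by_mask(
    inputs_embeds: torch.Tensor,    # [num_tokens, hidden_size]
    is_embed: torch.Tensor,         # [num_tokens], bool placeholder mask
    mm_embeds: list[torch.Tensor],  # per-item embeddings, [..., hidden_size]
) -> torch.Tensor:
    # Flatten all multimodal items into one [num_mm_tokens, hidden_size] tensor.
    flat = torch.cat([e.flatten(0, -2) for e in mm_embeds], dim=0)
    # Assign them to the masked rows; row counts must match exactly.
    inputs_embeds[is_embed] = flat.to(dtype=inputs_embeds.dtype)
    return inputs_embeds

# Toy usage: positions 1-2 and 4 are placeholders for two multimodal items.
hidden_size = 8
inputs_embeds = torch.zeros(6, hidden_size)
is_embed = torch.tensor([False, True, True, False, True, False])
mm_embeds = [torch.ones(2, hidden_size), 2 * torch.ones(1, hidden_size)]
merged = merge_by_mask(inputs_embeds, is_embed, mm_embeds)
assert merged[2].eq(1).all() and merged[4].eq(2).all()

Both this indexed assignment and `masked_scatter_` show up in later commits of the series: the indexed form raises on a row-count mismatch but forces a device sync, while `masked_scatter_` avoids the D2H sync at the cost of not flagging a too-long source tensor.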
From 437dacde337410a195ff6cfa9f27f9c88782ce4f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 8 Apr 2025 03:44:17 +0000 Subject: [PATCH 02/45] Rename Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- vllm/v1/worker/tpu_model_runner.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33c7e5951d20..1a8a7e87e932 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1035,11 +1035,11 @@ def execute_model( inputs_embeds = self.model.get_input_embeddings(self.input_ids) if mm_embeds: - is_embed: torch.Tensor = ... # TODO + is_mm_embeds: torch.Tensor = ... # TODO _merge_multimodal_embeddings( inputs_embeds, - is_embed, + is_mm_embeds, mm_embeds, ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index ffedfeffa549..4c08912ec9a5 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -675,11 +675,11 @@ def execute_model( inputs_embeds = self.model.get_input_embeddings(self.input_ids) if mm_embeds: - is_embed: torch.Tensor = ... # TODO + is_mm_embeds: torch.Tensor = ... # TODO _merge_multimodal_embeddings( inputs_embeds, - is_embed, + is_mm_embeds, mm_embeds, ) From 57e9f030d1cbe3ffcaab718d461d2475037a7550 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 9 Apr 2025 11:25:34 +0000 Subject: [PATCH 03/45] Use #16007 Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 7 ++----- vllm/v1/worker/tpu_model_runner.py | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 91d216ca8900..d55e4fe3ee95 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1028,11 +1028,8 @@ def execute_model( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - - # TODO: Apply vllm-project/vllm#16007 - # so that we don't have to define `get_input_embeddings` in the - # multi-modal model once V0 is deprecated - inputs_embeds = self.model.get_input_embeddings(self.input_ids) + language_model = self.model.get_language_model() + inputs_embeds = language_model.get_input_embeddings(self.input_ids) if mm_embeds: is_mm_embeds: torch.Tensor = ... # TODO diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index b0f51ab30455..f5259e0b3ce9 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -669,11 +669,8 @@ def execute_model( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - - # TODO: Apply vllm-project/vllm#16007 - # so that we don't have to define `get_input_embeddings` in the - # multi-modal model once V0 is deprecated - inputs_embeds = self.model.get_input_embeddings(self.input_ids) + language_model = self.model.get_language_model() + inputs_embeds = language_model.get_input_embeddings(self.input_ids) if mm_embeds: is_mm_embeds: torch.Tensor = ... 
# TODO From e08deaa31ce7176034bc37eafb6d137799b2c6dc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 27 Aug 2025 13:38:08 +0000 Subject: [PATCH 04/45] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dcc37c455d4..25073f5d2f19 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1521,6 +1521,9 @@ def execute_model( if self.supports_mm_inputs and get_pp_group().is_first_rank: is_embed = ... # TODO + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. inputs_embeds_scheduled = self.model.get_input_embeddings( input_ids=self.input_ids.gpu[:num_scheduled_tokens], multimodal_embeddings=mm_embeds or None, From 6a1307f473280f9e0f35704775cd9b78fb0ccb91 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 08:52:44 +0000 Subject: [PATCH 05/45] Update Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 17 +++++++++++------ vllm/v1/worker/tpu_model_runner.py | 19 ++++++++++++------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 25073f5d2f19..5d2c4115ed88 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -44,6 +44,7 @@ supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) +from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, PlaceholderRange) @@ -1519,16 +1520,20 @@ def execute_model( mm_embeds = [] if self.supports_mm_inputs and get_pp_group().is_first_rank: - is_embed = ... # TODO - # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. inputs_embeds_scheduled = self.model.get_input_embeddings( - input_ids=self.input_ids.gpu[:num_scheduled_tokens], - multimodal_embeddings=mm_embeds or None, - is_embed=is_embed if mm_embeds else None, - ) + self.input_ids.gpu[:num_scheduled_tokens]) + + if mm_embeds: + is_mm_embed = ... # TODO + + inputs_embeds_scheduled = _merge_multimodal_embeddings( + inputs_embeds_scheduled, + is_mm_embed, + multimodal_embeddings=mm_embeds, + ) # TODO(woosuk): Avoid the copy. Optimize. 
self.inputs_embeds[:num_scheduled_tokens].copy_( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a357db8d08d5..ab69fde307fd 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -31,6 +31,7 @@ from vllm.model_executor.models.interfaces import supports_transcription from vllm.model_executor.models.interfaces_base import ( is_pooling_model, is_text_generation_model) +from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, PlaceholderRange) @@ -948,16 +949,20 @@ def _gather_mm_embeddings( def _get_model_inputs(self, input_ids: torch.Tensor, mm_embeds: list[torch.Tensor]): if self.supports_mm_inputs: - is_embed = ... # TODO - # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - inputs_embeds = self.model.get_input_embeddings( - input_ids=input_ids, - multimodal_embeddings=mm_embeds or None, - is_embed=is_embed if mm_embeds else None, - ) + inputs_embeds = self.model.get_input_embeddings(self.input_ids) + + if mm_embeds: + is_mm_embed = ... # TODO + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds, + is_mm_embed, + multimodal_embeddings=mm_embeds, + ) + return None, inputs_embeds else: # For text-only models, we use token ids as input. From 3a4740a8786159cc196c8fb3ea9fb369b01f87dc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 08:53:33 +0000 Subject: [PATCH 06/45] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index ab69fde307fd..692fac4e73ea 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -952,7 +952,7 @@ def _get_model_inputs(self, input_ids: torch.Tensor, # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - inputs_embeds = self.model.get_input_embeddings(self.input_ids) + inputs_embeds = self.model.get_input_embeddings(input_ids) if mm_embeds: is_mm_embed = ... 
# TODO From 68c54d8ea9e91d4a9c19ffee59c3ddc599fed6bc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 10:11:58 +0000 Subject: [PATCH 07/45] Draft Signed-off-by: DarkLight1337 --- vllm/v1/spec_decode/eagle.py | 22 ++++++++++------ vllm/v1/worker/gpu_model_runner.py | 39 ++++++++++++++++++++------- vllm/v1/worker/tpu_model_runner.py | 42 ++++++++++++++++++++++-------- 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0a0e9fed725c..46f223604c99 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -18,6 +18,7 @@ from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -157,6 +158,7 @@ def propose( next_token_ids: torch.Tensor, common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, + is_mm_embed: Optional[torch.Tensor] = None, mm_embeds: Optional[list[torch.Tensor]] = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] @@ -196,18 +198,22 @@ def propose( # copy inputs to buffer for cudagraph self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states - if self.is_multimodal_model: - input_ids = self.input_ids[:num_tokens] - inputs_embeds = self.model.get_input_embeddings( - input_ids, - multimodal_embeddings=mm_embeds or None, + + if mm_embeds: + assert is_mm_embed is not None + + inputs_embeds_scheduled = _merge_multimodal_embeddings( + self.input_ids[:num_tokens], + is_mm_embed, + multimodal_embeddings=mm_embeds, ) - self.inputs_embeds[:num_tokens] = inputs_embeds - inputs_embeds = self.inputs_embeds[:num_input_tokens] + self.inputs_embeds[:num_tokens] = inputs_embeds_scheduled + input_ids = None + inputs_embeds = self.inputs_embeds[:num_input_tokens] else: - inputs_embeds = None input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None with set_forward_context(per_layer_attn_metadata, self.vllm_config, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5d2c4115ed88..cc075022aa28 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1186,8 +1186,15 @@ def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", shift_computed_tokens: int = 0, - ) -> list[torch.Tensor]: - mm_embeds: list[torch.Tensor] = [] + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + is_mm_embed = torch.zeros( + scheduler_output.total_num_scheduled_tokens, + dtype=torch.bool, + pin_memory=self.pin_memory, + ) + mm_embeds = list[torch.Tensor]() + + req_start_idx = 0 for req_id in self.input_batch.req_ids: num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] @@ -1196,6 +1203,7 @@ def _gather_mm_embeddings( req_state.num_computed_tokens + shift_computed_tokens mm_positions = req_state.mm_positions mm_hashes = req_state.mm_hashes + for i, pos_info in enumerate(mm_positions): start_pos = pos_info.offset num_encoder_tokens = pos_info.length @@ -1212,6 +1220,10 @@ def _gather_mm_embeddings( # in the decoder's KV cache. 
continue + req_start_pos = req_start_idx + start_pos + is_mm_embed[req_start_pos:req_start_pos + num_encoder_tokens] \ + = True if pos_info.is_embed is None else pos_info.is_embed + start_idx = max(num_computed_tokens - start_pos, 0) end_idx = min( num_computed_tokens - start_pos + num_scheduled_tokens, @@ -1232,7 +1244,10 @@ def _gather_mm_embeddings( is_embed=is_embed, ) mm_embeds.append(mm_embeds_item) - return mm_embeds + + req_start_idx += num_scheduled_tokens + + return is_mm_embed, mm_embeds def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. @@ -1515,9 +1530,10 @@ def execute_model( if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) - mm_embeds = self._gather_mm_embeddings(scheduler_output) + is_mm_embed, mm_embeds = self._gather_mm_embeddings( + scheduler_output) else: - mm_embeds = [] + is_mm_embed, mm_embeds = torch.tensor(False), [] if self.supports_mm_inputs and get_pp_group().is_first_rank: # NOTE(woosuk): To unify token ids and soft tokens (vision @@ -1527,8 +1543,6 @@ def execute_model( self.input_ids.gpu[:num_scheduled_tokens]) if mm_embeds: - is_mm_embed = ... # TODO - inputs_embeds_scheduled = _merge_multimodal_embeddings( inputs_embeds_scheduled, is_mm_embed, @@ -1868,10 +1882,14 @@ def propose_draft_token_ids( [h[token_indices] for h in aux_hidden_states], dim=-1) else: target_hidden_states = hidden_states[token_indices] - mm_embeds = None + if self.supports_mm_inputs: - mm_embeds = self._gather_mm_embeddings(scheduler_output, - shift_computed_tokens=1) + is_mm_embed, mm_embeds = self._gather_mm_embeddings( + scheduler_output, + shift_computed_tokens=1, + ) + else: + is_mm_embed, mm_embeds = torch.tensor(False), [] draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, @@ -1880,6 +1898,7 @@ def propose_draft_token_ids( next_token_ids=next_token_ids, sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, + is_mm_embed=is_mm_embed, mm_embeds=mm_embeds, ) return draft_token_ids diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 692fac4e73ea..7b4ead8661b6 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -907,8 +907,15 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", - ) -> list[torch.Tensor]: - mm_embeds: list[torch.Tensor] = [] + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + is_mm_embed = torch.zeros( + scheduler_output.total_num_scheduled_tokens, + dtype=torch.bool, + pin_memory=self.pin_memory, + ) + mm_embeds = list[torch.Tensor]() + + req_start_idx = 0 for req_id in self.input_batch.req_ids: num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] @@ -936,6 +943,10 @@ def _gather_mm_embeddings( # in the decoder's KV cache. continue + req_start_pos = req_start_idx + start_pos + is_mm_embed[req_start_pos:req_start_pos + num_encoder_tokens] \ + = True + mm_hash = mm_hashes[i] encoder_output = self.encoder_cache.get(mm_hash, None) assert encoder_output is not None,\ @@ -944,10 +955,17 @@ def _gather_mm_embeddings( " be contiguous and embeddings." 
encoder_output = self.encoder_cache[mm_hash] mm_embeds.append(encoder_output) - return mm_embeds - def _get_model_inputs(self, input_ids: torch.Tensor, - mm_embeds: list[torch.Tensor]): + req_start_idx += num_scheduled_tokens + + return is_mm_embed, mm_embeds + + def _get_model_inputs( + self, + input_ids: torch.Tensor, + is_mm_embed: torch.Tensor, + mm_embeds: list[torch.Tensor], + ): if self.supports_mm_inputs: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -955,8 +973,6 @@ def _get_model_inputs(self, input_ids: torch.Tensor, inputs_embeds = self.model.get_input_embeddings(input_ids) if mm_embeds: - is_mm_embed = ... # TODO - inputs_embeds = _merge_multimodal_embeddings( inputs_embeds, is_mm_embed, @@ -990,9 +1006,11 @@ def execute_model( if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) - mm_embeds = self._gather_mm_embeddings(scheduler_output) + is_mm_embed, mm_embeds = self._gather_mm_embeddings( + scheduler_output) else: - mm_embeds = [] + is_mm_embed, mm_embeds = torch.tensor(False), [] + xm.mark_step() # Prepare inputs, the requests might be split into multiple # executions, combine the result of each execution. @@ -1009,7 +1027,7 @@ def execute_model( attn_metadata, logits_indices, padded_num_reqs, num_reqs,\ end_index = self._prepare_inputs(scheduler_output, start_index) input_ids, inputs_embeds = self._get_model_inputs( - self.input_ids, mm_embeds) + self.input_ids, is_mm_embed, mm_embeds) xm.mark_step() # Run the decoder with set_forward_context( @@ -1366,6 +1384,7 @@ def _precompile_mm_encoder(self) -> None: placeholders_ids = placeholders_ids.to(self.device) # Assign outputs or the graph will be cut short. a, b = self._get_model_inputs(placeholders_ids, + torch.tensor(True), [mm_embeds]) assert a is None xm.mark_step() @@ -1377,7 +1396,8 @@ def _precompile_mm_encoder(self) -> None: dtype=torch.int32, device="cpu") placeholders_ids = placeholders_ids.to(self.device) - a, b = self._get_model_inputs(placeholders_ids, []) + a, b = self._get_model_inputs(placeholders_ids, + torch.tensor(False), []) assert a is None xm.mark_step() From 6ddc91e7b5484b5c2324efff65e8203f9e60fe05 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 10:41:25 +0000 Subject: [PATCH 08/45] Fix device Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cc075022aa28..8bfd022b0361 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1247,7 +1247,7 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - return is_mm_embed, mm_embeds + return is_mm_embed.to(self.device), mm_embeds def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7b4ead8661b6..2e6fbdc6eb74 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -958,7 +958,7 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - return is_mm_embed, mm_embeds + return is_mm_embed.to(self.device), mm_embeds def _get_model_inputs( self, From 28cc8cbf1692434ee039eee663b768e897c4290f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 10:59:56 +0000 Subject: [PATCH 09/45] Persistent buffer Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 15 ++++++----- vllm/v1/worker/tpu_model_runner.py | 42 ++++++++---------------------- 2 files changed, 20 insertions(+), 37 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8bfd022b0361..85766954f082 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -258,6 +258,10 @@ def __init__( dtype=self.dtype, device=self.device) + # Only relevant for multimodal models + self.is_mm_embed = self._make_buffer(self.max_num_tokens, + dtype=torch.bool) + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: # NOTE: `mrope_positions` is implemented with one additional dummy @@ -1187,11 +1191,7 @@ def _gather_mm_embeddings( scheduler_output: "SchedulerOutput", shift_computed_tokens: int = 0, ) -> tuple[torch.Tensor, list[torch.Tensor]]: - is_mm_embed = torch.zeros( - scheduler_output.total_num_scheduled_tokens, - dtype=torch.bool, - pin_memory=self.pin_memory, - ) + is_mm_embed = self.is_mm_embed.cpu mm_embeds = list[torch.Tensor]() req_start_idx = 0 @@ -1247,7 +1247,10 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - return is_mm_embed.to(self.device), mm_embeds + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) + + return self.is_mm_embed.gpu[:total_num_scheduled_tokens], mm_embeds def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 2e6fbdc6eb74..44f7e9b74de0 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -262,6 +262,12 @@ def __init__( pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + # Only relevant for multimodal models + self.is_mm_embed_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.bool, + device="cpu", + pin_memory=self.pin_memory) + # Range tensor with values [0 .. self.max_num_tokens - 1]. 
# Used to initialize positions / context_lens / seq_lens # Keep in int64 to avoid overflow with long context @@ -810,31 +816,6 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput", return per_layer_attn_metadata, logits_indices, padded_num_reqs,\ num_reqs, end_index - def _scatter_placeholders( - self, - embeds: torch.Tensor, - is_embed: Optional[torch.Tensor], - ) -> torch.Tensor: - if is_embed is None: - return embeds - - placeholders = embeds.new_full( - (is_embed.shape[0], embeds.shape[-1]), - fill_value=torch.nan, - ) - placeholders[is_embed] = embeds - return placeholders - - def _gather_placeholders( - self, - placeholders: torch.Tensor, - is_embed: Optional[torch.Tensor], - ) -> torch.Tensor: - if is_embed is None: - return placeholders - - return placeholders[is_embed] - def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: @@ -908,11 +889,7 @@ def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", ) -> tuple[torch.Tensor, list[torch.Tensor]]: - is_mm_embed = torch.zeros( - scheduler_output.total_num_scheduled_tokens, - dtype=torch.bool, - pin_memory=self.pin_memory, - ) + is_mm_embed = self.is_mm_embed_cpu mm_embeds = list[torch.Tensor]() req_start_idx = 0 @@ -958,7 +935,10 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - return is_mm_embed.to(self.device), mm_embeds + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + is_mm_embed = is_mm_embed[:total_num_scheduled_tokens].to(self.device) + + return is_mm_embed, mm_embeds def _get_model_inputs( self, From c33590803f06c04e70498bb711f83115e5b54c21 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 11:07:07 +0000 Subject: [PATCH 10/45] Avoid unnecessary initialization Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 5 +++-- vllm/v1/worker/tpu_model_runner.py | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 85766954f082..365086ff4f43 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -259,8 +259,9 @@ def __init__( device=self.device) # Only relevant for multimodal models - self.is_mm_embed = self._make_buffer(self.max_num_tokens, - dtype=torch.bool) + if self.supports_mm_inputs: + self.is_mm_embed = self._make_buffer(self.max_num_tokens, + dtype=torch.bool) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 44f7e9b74de0..d575557a0bc9 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -263,10 +263,11 @@ def __init__( self.seq_lens_np = self.seq_lens_cpu.numpy() # Only relevant for multimodal models - self.is_mm_embed_cpu = torch.zeros(self.max_num_tokens, - dtype=torch.bool, - device="cpu", - pin_memory=self.pin_memory) + if self.supports_mm_inputs: + self.is_mm_embed_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.bool, + device="cpu", + pin_memory=self.pin_memory) # Range tensor with values [0 .. self.max_num_tokens - 1]. 
# Used to initialize positions / context_lens / seq_lens From cbb70ea9035f4f7b5b0981b9de251085400c61f9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 14:15:07 +0000 Subject: [PATCH 11/45] Fix reset Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- vllm/v1/worker/tpu_model_runner.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 365086ff4f43..ce8a38d2790e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1192,10 +1192,13 @@ def _gather_mm_embeddings( scheduler_output: "SchedulerOutput", shift_computed_tokens: int = 0, ) -> tuple[torch.Tensor, list[torch.Tensor]]: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + is_mm_embed = self.is_mm_embed.cpu + is_mm_embed[:total_num_scheduled_tokens] = False mm_embeds = list[torch.Tensor]() - req_start_idx = 0 + for req_id in self.input_batch.req_ids: num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] @@ -1248,7 +1251,6 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) return self.is_mm_embed.gpu[:total_num_scheduled_tokens], mm_embeds diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d575557a0bc9..d033039358f7 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -890,10 +890,13 @@ def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", ) -> tuple[torch.Tensor, list[torch.Tensor]]: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + is_mm_embed = self.is_mm_embed_cpu + is_mm_embed[:total_num_scheduled_tokens] = False mm_embeds = list[torch.Tensor]() - req_start_idx = 0 + for req_id in self.input_batch.req_ids: num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] @@ -901,6 +904,7 @@ def _gather_mm_embeddings( num_computed_tokens = req_state.num_computed_tokens mm_positions = req_state.mm_positions mm_hashes = req_state.mm_hashes + # TODO unroll loop and assume/enforce --disable_chunked_mm_input # NOTE (NickLucche) here we diverge from logic in other runners, as # we assume to only have whole mm items to process. Hence we avoid @@ -936,7 +940,6 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens is_mm_embed = is_mm_embed[:total_num_scheduled_tokens].to(self.device) return is_mm_embed, mm_embeds From 76f2925aea471918afc264636c6ac2c342b800e0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 14:24:46 +0000 Subject: [PATCH 12/45] Update Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 8 ++++---- vllm/v1/worker/tpu_model_runner.py | 26 +++++++++++++++----------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ce8a38d2790e..7ec2695bd95d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1224,10 +1224,6 @@ def _gather_mm_embeddings( # in the decoder's KV cache. 
continue - req_start_pos = req_start_idx + start_pos - is_mm_embed[req_start_pos:req_start_pos + num_encoder_tokens] \ - = True if pos_info.is_embed is None else pos_info.is_embed - start_idx = max(num_computed_tokens - start_pos, 0) end_idx = min( num_computed_tokens - start_pos + num_scheduled_tokens, @@ -1243,6 +1239,10 @@ def _gather_mm_embeddings( if (is_embed := pos_info.is_embed) is not None: is_embed = is_embed[start_idx:end_idx] + req_start_pos = req_start_idx + start_pos + is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ + = True if is_embed is None else is_embed + mm_embeds_item = gather_mm_placeholders( encoder_output[start_idx:end_idx], is_embed=is_embed, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d033039358f7..2f4fac234166 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -876,12 +876,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): # NOTE (NickLucche) here we diverge from logic in other runners, as we # assume to only have whole mm items to process. Hence we avoid the # intrinsic dynamism that `scatter_mm_placeholders` introduces. - for (mm_hash, pos_info), output in zip( - mm_hashes_pos, - encoder_outputs, - ): - if req_id not in self.encoder_cache: - self.encoder_cache[req_id] = {} + for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs): assert pos_info.is_embed is None, "Expected all positions to be"\ " contiguous and embeddings." self.encoder_cache[mm_hash] = output @@ -925,18 +920,27 @@ def _gather_mm_embeddings( # in the decoder's KV cache. continue - req_start_pos = req_start_idx + start_pos - is_mm_embed[req_start_pos:req_start_pos + num_encoder_tokens] \ - = True + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens, + ) + assert start_idx < end_idx mm_hash = mm_hashes[i] encoder_output = self.encoder_cache.get(mm_hash, None) assert encoder_output is not None,\ f"Encoder cache miss for {mm_hash}." + assert pos_info.is_embed is None, "Expected all positions to"\ " be contiguous and embeddings." - encoder_output = self.encoder_cache[mm_hash] - mm_embeds.append(encoder_output) + + req_start_pos = req_start_idx + start_pos + is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ + = True + + mm_embeds_item = encoder_output[start_idx:end_idx] + mm_embeds.append(mm_embeds_item) req_start_idx += num_scheduled_tokens From b6e87759d371a48802e5da995178d1c07bfee653 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 28 Aug 2025 14:30:33 +0000 Subject: [PATCH 13/45] Simplify Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7ec2695bd95d..27c6ff8c7554 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1251,9 +1251,9 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) + is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) - return self.is_mm_embed.gpu[:total_num_scheduled_tokens], mm_embeds + return is_mm_embed, mm_embeds def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. 
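The commits up to this point move the placeholder mask onto a persistent CPU buffer that is reset and refilled on every step and copied to the device once per batch. Below is a minimal sketch of that bookkeeping under a simplified view of the scheduler state; `PlaceholderInfo`, `MaskBuilder`, and the tuple layout are hypothetical stand-ins, not the runner's real types.

from dataclasses import dataclass
import torch

@dataclass
class PlaceholderInfo:
    offset: int   # first prompt position occupied by the multimodal item
    length: int   # number of placeholder tokens it occupies

class MaskBuilder:
    def __init__(self, max_num_tokens: int, device: torch.device):
        self.device = device
        # In the runner this buffer would be pinned for async H2D copies.
        self.is_mm_embed_cpu = torch.zeros(max_num_tokens, dtype=torch.bool)

    def build(
        self,
        scheduled: list[tuple[int, int, list[PlaceholderInfo]]],
    ) -> torch.Tensor:
        """scheduled holds (num_computed_tokens, num_scheduled_tokens,
        placeholders) per request, in batch order."""
        total = sum(num_scheduled for _, num_scheduled, _ in scheduled)
        mask = self.is_mm_embed_cpu
        mask[:total] = False  # reset only the prefix used this step
        req_start_idx = 0
        for num_computed, num_scheduled, placeholders in scheduled:
            for p in placeholders:
                # Part of this placeholder that falls inside the window of
                # tokens scheduled for this request in the current step.
                start_idx = max(num_computed - p.offset, 0)
                end_idx = min(num_computed - p.offset + num_scheduled,
                              p.length)
                if start_idx >= end_idx:
                    continue
                # Placeholder start relative to the batch, measured from the
                # first token scheduled for this request.
                req_start_pos = req_start_idx + p.offset - num_computed
                mask[req_start_pos + start_idx:req_start_pos + end_idx] = True
            req_start_idx += num_scheduled
        return mask[:total].to(self.device, non_blocking=True)

# Toy usage: one request, 4 tokens scheduled, image placeholder at offset 1.
builder = MaskBuilder(max_num_tokens=16, device=torch.device("cpu"))
mask = builder.build([(0, 4, [PlaceholderInfo(offset=1, length=2)])])
assert mask.tolist() == [False, True, True, False]

Keeping the buffer persistent avoids reallocating a (pinned) tensor every step, and resetting only the first total_num_scheduled_tokens entries keeps the host-side work proportional to the batch actually scheduled. Note that the sketch takes the placeholder offset relative to num_computed_tokens, matching the correction the series itself makes in a later commit.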
From f71a40b192f7323b91381592c73b259baba51138 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 1 Sep 2025 02:41:27 +0000 Subject: [PATCH 14/45] Use padded tokens Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a24f13701084..229766b7fb6d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -885,9 +885,11 @@ def _gather_mm_embeddings( scheduler_output: "SchedulerOutput", ) -> tuple[torch.Tensor, list[torch.Tensor]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + padded_total_num_scheduled_tokens = _get_padded_token_len( + self.num_tokens_paddings, total_num_scheduled_tokens) is_mm_embed = self.is_mm_embed_cpu - is_mm_embed[:total_num_scheduled_tokens] = False + is_mm_embed[:padded_total_num_scheduled_tokens] = False mm_embeds = list[torch.Tensor]() req_start_idx = 0 @@ -943,7 +945,8 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - is_mm_embed = is_mm_embed[:total_num_scheduled_tokens].to(self.device) + is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens] \ + .to(self.device) return is_mm_embed, mm_embeds From 3af1bdb73e3f6424bc80f59fd4664706a6a083d9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 1 Sep 2025 11:14:34 +0000 Subject: [PATCH 15/45] Fix wrong device Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- vllm/v1/worker/tpu_model_runner.py | 18 ++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6d4f373ac443..2fb288dedf6f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1536,7 +1536,8 @@ def execute_model( is_mm_embed, mm_embeds = self._gather_mm_embeddings( scheduler_output) else: - is_mm_embed, mm_embeds = torch.tensor(False), [] + is_mm_embed, mm_embeds = torch.tensor(False, + device=self.device), [] if self.supports_mm_inputs and get_pp_group().is_first_rank: # NOTE(woosuk): To unify token ids and soft tokens (vision @@ -1892,7 +1893,8 @@ def propose_draft_token_ids( shift_computed_tokens=1, ) else: - is_mm_embed, mm_embeds = torch.tensor(False), [] + is_mm_embed, mm_embeds = torch.tensor(False, + device=self.device), [] draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 229766b7fb6d..91be1f597bee 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -999,7 +999,8 @@ def execute_model( is_mm_embed, mm_embeds = self._gather_mm_embeddings( scheduler_output) else: - is_mm_embed, mm_embeds = torch.tensor(False), [] + is_mm_embed, mm_embeds = torch.tensor(False, + device=self.device), [] xm.mark_step() # Prepare inputs, the requests might be split into multiple @@ -1373,9 +1374,11 @@ def _precompile_mm_encoder(self) -> None: placeholders_ids = placeholders_ids.to(self.device) # Assign outputs or the graph will be cut short. 
- a, b = self._get_model_inputs(placeholders_ids, - torch.tensor(True), - [mm_embeds]) + a, b = self._get_model_inputs( + placeholders_ids, + torch.tensor(True, device=self.device), + [mm_embeds], + ) assert a is None xm.mark_step() @@ -1386,8 +1389,11 @@ def _precompile_mm_encoder(self) -> None: dtype=torch.int32, device="cpu") placeholders_ids = placeholders_ids.to(self.device) - a, b = self._get_model_inputs(placeholders_ids, - torch.tensor(False), []) + a, b = self._get_model_inputs( + placeholders_ids, + torch.tensor(False, device=self.device), + [], + ) assert a is None xm.mark_step() From 975569dd584801315e5bf40f9f460c293668cec2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 2 Sep 2025 10:47:33 +0000 Subject: [PATCH 16/45] Debug Signed-off-by: DarkLight1337 --- vllm/model_executor/models/utils.py | 54 +++++++++++++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 17 +++++++++ 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 28cfefac30dd..c5fac8574807 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -405,6 +405,22 @@ def merge_multimodal_embeddings_from_map( return inputs_embeds +def _validate_multimodal_embeddings( + is_multimodal: torch.Tensor, + multimodal_embeddings: NestedTensors, + mm_embeds_flat: torch.Tensor, +) -> None: + num_expected_tokens = is_multimodal.sum().item() + assert isinstance(num_expected_tokens, int) + + if mm_embeds_flat.shape[0] != num_expected_tokens: + expr = _embedding_count_expression(multimodal_embeddings) + + raise ValueError( + f"Attempted to assign {expr} = {mm_embeds_flat.shape[0]} " + f"multimodal tokens to {num_expected_tokens} placeholders") + + def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, @@ -418,23 +434,31 @@ def _merge_multimodal_embeddings( Note: This updates ``inputs_embeds`` in place. """ - flattened = _flatten_embeddings(multimodal_embeddings) + mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) + + if envs.VLLM_LOGGING_LEVEL == "DEBUG": + _validate_multimodal_embeddings( + is_multimodal, + multimodal_embeddings, + mm_embeds_flat, + ) + try: - # This is equivalent to: inputs_embeds[is_multimodal] = flattened. 
- inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), - flattened.to(dtype=inputs_embeds.dtype)) + input_dtype = inputs_embeds.dtype + inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) + # This is equivalent to: + # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) + # inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), + # mm_embeds_flat.to(dtype=input_dtype)) except RuntimeError as e: - num_expected_tokens = is_multimodal.sum().item() - assert isinstance(num_expected_tokens, int) - - if flattened.shape[0] != num_expected_tokens: - expr = _embedding_count_expression(multimodal_embeddings) - raise ValueError( - f"Attempted to assign {expr} = {flattened.shape[0]} " - f"multimodal tokens to {num_expected_tokens} placeholders" - ) from e - else: - raise ValueError("Error during masked scatter operation") from e + # Raise the validation error if possible instead of the original error + _validate_multimodal_embeddings( + is_multimodal, + multimodal_embeddings, + mm_embeds_flat, + ) + + raise ValueError("Error during masked scatter operation") from e return inputs_embeds diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 023b7417b69a..e27c2cbde2b7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1231,6 +1231,22 @@ def _gather_mm_embeddings( req_start_pos = req_start_idx + start_pos is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ = True if is_embed is None else is_embed + print("req_start_pos", req_start_pos) + print("set", req_start_pos + start_idx, ":", + req_start_pos + end_idx, "to", + True if is_embed is None else is_embed) + print((is_mm_embed[:-1] != is_mm_embed[1:]).nonzero() + 1) + # For tests/models/multimodal/generation/test_common.py::test_custom_inputs_models[llava-test_case3], + # the expected output for the first item is: + # tensor([[ 5], + # [1157]]) + # the expected output for the second item is: + # tensor([[ 6], + # [1158], + # [1171], + # [2904], + # [2922], + # [3498]]) mm_embeds_item = gather_mm_placeholders( encoder_output[start_idx:end_idx], @@ -1241,6 +1257,7 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) + print("total_num_scheduled_tokens", total_num_scheduled_tokens) return is_mm_embed, mm_embeds From c0015810cb9ef8f2f0144012d5cc180d7fc226f5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 11:19:10 +0000 Subject: [PATCH 17/45] Fix? 
Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 20 ++------------------ vllm/v1/worker/tpu_model_runner.py | 2 +- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 52b850eb0ca2..8463de915e3b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1505,6 +1505,7 @@ def _gather_mm_embeddings( req_state = self.requests[req_id] num_computed_tokens = \ req_state.num_computed_tokens + shift_computed_tokens + for mm_feature in req_state.mm_features: pos_info = mm_feature.mm_position start_pos = pos_info.offset @@ -1537,25 +1538,9 @@ def _gather_mm_embeddings( if (is_embed := pos_info.is_embed) is not None: is_embed = is_embed[start_idx:end_idx] - req_start_pos = req_start_idx + start_pos + req_start_pos = req_start_idx + start_pos - num_computed_tokens is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ = True if is_embed is None else is_embed - print("req_start_pos", req_start_pos) - print("set", req_start_pos + start_idx, ":", - req_start_pos + end_idx, "to", - True if is_embed is None else is_embed) - print((is_mm_embed[:-1] != is_mm_embed[1:]).nonzero() + 1) - # For tests/models/multimodal/generation/test_common.py::test_custom_inputs_models[llava-test_case3], - # the expected output for the first item is: - # tensor([[ 5], - # [1157]]) - # the expected output for the second item is: - # tensor([[ 6], - # [1158], - # [1171], - # [2904], - # [2922], - # [3498]]) mm_embeds_item = gather_mm_placeholders( encoder_output[start_idx:end_idx], @@ -1566,7 +1551,6 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) - print("total_num_scheduled_tokens", total_num_scheduled_tokens) return is_mm_embed, mm_embeds diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a7044242841d..cc01bd80a624 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -934,7 +934,7 @@ def _gather_mm_embeddings( assert pos_info.is_embed is None, "Expected all positions to"\ " be contiguous and embeddings." 
- req_start_pos = req_start_idx + start_pos + req_start_pos = req_start_idx + start_pos - num_computed_tokens is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ = True From 9e4512c8fda57a5c39af0eadda713d0ddbc25932 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 11:29:51 +0000 Subject: [PATCH 18/45] Simplify the code Signed-off-by: DarkLight1337 --- vllm/v1/spec_decode/eagle.py | 8 +++---- vllm/v1/worker/gpu_model_runner.py | 20 ++++++++--------- vllm/v1/worker/tpu_model_runner.py | 35 +++++++++++++++--------------- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index ac4046580e1c..11c0973c1af6 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -169,8 +169,8 @@ def propose( last_token_indices: Optional[torch.Tensor], common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, - is_mm_embed: Optional[torch.Tensor] = None, - mm_embeds: Optional[list[torch.Tensor]] = None, + mm_embed_inputs: Optional[tuple[torch.Tensor, + list[torch.Tensor]]] = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] @@ -214,8 +214,8 @@ def propose( self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states - if mm_embeds: - assert is_mm_embed is not None + if mm_embed_inputs: + is_mm_embed, mm_embeds = mm_embed_inputs inputs_embeds_scheduled = _merge_multimodal_embeddings( self.input_ids[:num_tokens], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8463de915e3b..093b0a5a8227 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1911,12 +1911,11 @@ def _preprocess( inputs_embeds_scheduled = self.model.get_input_embeddings( self.input_ids.gpu[:num_scheduled_tokens]) - if mm_embeds: - inputs_embeds_scheduled = _merge_multimodal_embeddings( - inputs_embeds_scheduled, - is_mm_embed, - multimodal_embeddings=mm_embeds, - ) + inputs_embeds_scheduled = _merge_multimodal_embeddings( + inputs_embeds_scheduled, + is_mm_embed, + multimodal_embeddings=mm_embeds, + ) # TODO(woosuk): Avoid the copy. Optimize. 
self.inputs_embeds.gpu[:num_scheduled_tokens].copy_( @@ -2448,13 +2447,12 @@ def propose_draft_token_ids( target_hidden_states = hidden_states[token_indices] if self.supports_mm_inputs: - is_mm_embed, mm_embeds = self._gather_mm_embeddings( + mm_embed_inputs = self._gather_mm_embeddings( scheduler_output, shift_computed_tokens=1, ) else: - is_mm_embed, mm_embeds = torch.tensor(False, - device=self.device), [] + mm_embed_inputs = None draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, @@ -2464,9 +2462,9 @@ def propose_draft_token_ids( last_token_indices=token_indices_to_sample, sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, - is_mm_embed=is_mm_embed, - mm_embeds=mm_embeds, + mm_embed_inputs=mm_embed_inputs, ) + return draft_token_ids def propose_ngram_draft_token_ids( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index cc01bd80a624..8351c13aff77 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -951,21 +951,21 @@ def _gather_mm_embeddings( def _get_model_inputs( self, input_ids: torch.Tensor, - is_mm_embed: torch.Tensor, - mm_embeds: list[torch.Tensor], + mm_embed_inputs: Optional[tuple[torch.Tensor, list[torch.Tensor]]], ): - if self.supports_mm_inputs: + if mm_embed_inputs: + is_mm_embed, mm_embeds = mm_embed_inputs + # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. inputs_embeds = self.model.get_input_embeddings(input_ids) - if mm_embeds: - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds, - is_mm_embed, - multimodal_embeddings=mm_embeds, - ) + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds, + is_mm_embed, + multimodal_embeddings=mm_embeds, + ) return None, inputs_embeds else: @@ -994,11 +994,9 @@ def execute_model( if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) - is_mm_embed, mm_embeds = self._gather_mm_embeddings( - scheduler_output) + mm_embed_inputs = self._gather_mm_embeddings(scheduler_output) else: - is_mm_embed, mm_embeds = torch.tensor(False, - device=self.device), [] + mm_embed_inputs = None xm.mark_step() # Prepare inputs, the requests might be split into multiple @@ -1016,7 +1014,7 @@ def execute_model( attn_metadata, logits_indices, padded_num_reqs, num_reqs,\ end_index = self._prepare_inputs(scheduler_output, start_index) input_ids, inputs_embeds = self._get_model_inputs( - self.input_ids, is_mm_embed, mm_embeds) + self.input_ids, mm_embed_inputs) xm.mark_step() # Run the decoder with set_forward_context( @@ -1374,8 +1372,10 @@ def _precompile_mm_encoder(self) -> None: # Assign outputs or the graph will be cut short. 
a, b = self._get_model_inputs( placeholders_ids, - torch.tensor(True, device=self.device), - [mm_embeds], + mm_embed_inputs=( + torch.tensor(True, device=self.device), + [mm_embeds], + ), ) assert a is None xm.mark_step() @@ -1389,8 +1389,7 @@ def _precompile_mm_encoder(self) -> None: placeholders_ids = placeholders_ids.to(self.device) a, b = self._get_model_inputs( placeholders_ids, - torch.tensor(False, device=self.device), - [], + mm_embed_inputs=None, ) assert a is None xm.mark_step() From e002d444f979b934d7d68c7a3c60d5e242660d03 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 13:09:09 +0000 Subject: [PATCH 19/45] Reduce diffs Signed-off-by: DarkLight1337 --- vllm/model_executor/models/utils.py | 63 +++++++++++++---------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 3f1cf93005b0..b407052182f3 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -405,22 +405,6 @@ def merge_multimodal_embeddings_from_map( return inputs_embeds -def _validate_multimodal_embeddings( - is_multimodal: torch.Tensor, - multimodal_embeddings: NestedTensors, - mm_embeds_flat: torch.Tensor, -) -> None: - num_expected_tokens = is_multimodal.sum().item() - assert isinstance(num_expected_tokens, int) - - if mm_embeds_flat.shape[0] != num_expected_tokens: - expr = _embedding_count_expression(multimodal_embeddings) - - raise ValueError( - f"Attempted to assign {expr} = {mm_embeds_flat.shape[0]} " - f"multimodal tokens to {num_expected_tokens} placeholders") - - def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, @@ -434,29 +418,38 @@ def _merge_multimodal_embeddings( Note: This updates ``inputs_embeds`` in place. 
""" - mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) + if len(multimodal_embeddings) == 0: + if is_multimodal.any(): + num_expected_tokens = is_multimodal.sum().item() - if envs.VLLM_LOGGING_LEVEL == "DEBUG": - _validate_multimodal_embeddings( - is_multimodal, - multimodal_embeddings, - mm_embeds_flat, - ) + raise ValueError( + f"Attempted to assign 0 " + f"multimodal tokens to {num_expected_tokens} placeholders") + + return inputs_embeds + + mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) + input_dtype = inputs_embeds.dtype try: - input_dtype = inputs_embeds.dtype - inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) - # This is equivalent to: - # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) - # inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), - # mm_embeds_flat.to(dtype=input_dtype)) + if envs.VLLM_LOGGING_LEVEL == "DEBUG": + inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) + else: + # NOTE: Unlike the debug code, this does not raise an error if + # is_multimodal.sum() < len(mm_embeds_flat) + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), + mm_embeds_flat.to(dtype=input_dtype)) except RuntimeError as e: - # Raise the validation error if possible instead of the original error - _validate_multimodal_embeddings( - is_multimodal, - multimodal_embeddings, - mm_embeds_flat, - ) + num_actual_tokens = len(mm_embeds_flat) + num_expected_tokens = is_multimodal.sum().item() + + if num_actual_tokens != num_expected_tokens: + expr = _embedding_count_expression(multimodal_embeddings) + + raise ValueError( + f"Attempted to assign {expr} = {num_actual_tokens} " + f"multimodal tokens to {num_expected_tokens} placeholders" + ) from e raise ValueError("Error during masked scatter operation") from e From 1934f252d9e557cec7c72bcc3b55e094d1a4e4b8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 13:19:56 +0000 Subject: [PATCH 20/45] Avoid intermediate variable Signed-off-by: DarkLight1337 --- vllm/v1/spec_decode/eagle.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 11c0973c1af6..83687962b13a 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -217,12 +217,11 @@ def propose( if mm_embed_inputs: is_mm_embed, mm_embeds = mm_embed_inputs - inputs_embeds_scheduled = _merge_multimodal_embeddings( + self.inputs_embeds[:num_tokens] = _merge_multimodal_embeddings( self.input_ids[:num_tokens], is_mm_embed, multimodal_embeddings=mm_embeds, ) - self.inputs_embeds[:num_tokens] = inputs_embeds_scheduled input_ids = None inputs_embeds = self.inputs_embeds[:num_input_tokens] From 573cb4bad2d25c755e085152291c994e500d8b08 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 13:25:27 +0000 Subject: [PATCH 21/45] Standardize input embeddings logic Signed-off-by: DarkLight1337 --- vllm/v1/spec_decode/eagle.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 83687962b13a..e906518b422a 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -346,14 +346,15 @@ def propose( self.input_ids[:batch_size] = input_ids self.positions[:batch_size] = clamped_positions self.hidden_states[:batch_size] = hidden_states - if self.is_multimodal_model: - inputs_embeds = self.model.get_input_embeddings(input_ids) - self.inputs_embeds[:batch_size] = inputs_embeds - inputs_embeds = 
self.inputs_embeds[:input_batch_size] + if mm_embed_inputs: + self.inputs_embeds[:batch_size] = \ + self.model.get_input_embeddings(input_ids) + input_ids = None + inputs_embeds = self.inputs_embeds[:input_batch_size] else: - inputs_embeds = None input_ids = self.input_ids[:input_batch_size] + inputs_embeds = None # Run the model. with set_forward_context(per_layer_attn_metadata, From fa5e688ad3ab3b7ff3fe868de10981381b25ffc7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 13:29:57 +0000 Subject: [PATCH 22/45] Cleanup Signed-off-by: DarkLight1337 --- vllm/v1/spec_decode/eagle.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index e906518b422a..5fd688753d43 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -19,6 +19,7 @@ from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.model_executor.models.utils import _merge_multimodal_embeddings +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -76,8 +77,10 @@ def __init__( # hidden size (e.g., Llama 3.3 70B). self.hidden_size = self.draft_model_config.get_hidden_size() - self.is_multimodal_model = vllm_config.model_config \ - .is_multimodal_model + # Multi-modal data support + self.mm_registry = MULTIMODAL_REGISTRY + self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( + vllm_config.model_config) self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and @@ -214,7 +217,9 @@ def propose( self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states - if mm_embed_inputs: + if self.supports_mm_inputs: + assert mm_embed_inputs is not None, ( + "Multi-modal embeddings should be passed from model runner") is_mm_embed, mm_embeds = mm_embed_inputs self.inputs_embeds[:num_tokens] = _merge_multimodal_embeddings( @@ -346,7 +351,7 @@ def propose( self.input_ids[:batch_size] = input_ids self.positions[:batch_size] = clamped_positions self.hidden_states[:batch_size] = hidden_states - if mm_embed_inputs: + if self.supports_mm_inputs: self.inputs_embeds[:batch_size] = \ self.model.get_input_embeddings(input_ids) @@ -861,7 +866,7 @@ def dummy_run( ) -> None: with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): - if self.is_multimodal_model: + if self.supports_mm_inputs: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] else: From 0799fdb04150f9c6815512950d70fa58fb460758 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 16:07:52 +0000 Subject: [PATCH 23/45] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 8351c13aff77..e6bfae28f65c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -953,7 +953,9 @@ def _get_model_inputs( input_ids: torch.Tensor, mm_embed_inputs: Optional[tuple[torch.Tensor, list[torch.Tensor]]], ): - if mm_embed_inputs: + if self.supports_mm_inputs: + assert mm_embed_inputs is not None, ( + "Multi-modal embeddings should be passed from model runner") is_mm_embed, mm_embeds = mm_embed_inputs # 
NOTE(woosuk): To unify token ids and soft tokens (vision From 7f58edc4280806a0c866960b3ae54ec070a27182 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 16:27:33 +0000 Subject: [PATCH 24/45] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index e6bfae28f65c..534975e69ce4 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -954,20 +954,19 @@ def _get_model_inputs( mm_embed_inputs: Optional[tuple[torch.Tensor, list[torch.Tensor]]], ): if self.supports_mm_inputs: - assert mm_embed_inputs is not None, ( - "Multi-modal embeddings should be passed from model runner") - is_mm_embed, mm_embeds = mm_embed_inputs - # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. inputs_embeds = self.model.get_input_embeddings(input_ids) - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds, - is_mm_embed, - multimodal_embeddings=mm_embeds, - ) + if mm_embed_inputs: + is_mm_embed, mm_embeds = mm_embed_inputs + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds, + is_mm_embed, + multimodal_embeddings=mm_embeds, + ) return None, inputs_embeds else: From 1e9ec64486ed8fd0458afa2d41f937a52f0b2cef Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 18 Sep 2025 17:16:40 +0000 Subject: [PATCH 25/45] Comment out debug path Signed-off-by: DarkLight1337 --- vllm/model_executor/models/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index b407052182f3..14335610bb9c 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -432,13 +432,13 @@ def _merge_multimodal_embeddings( input_dtype = inputs_embeds.dtype try: - if envs.VLLM_LOGGING_LEVEL == "DEBUG": - inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) - else: - # NOTE: Unlike the debug code, this does not raise an error if - # is_multimodal.sum() < len(mm_embeds_flat) - inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), - mm_embeds_flat.to(dtype=input_dtype)) + # For debugging + # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype) + + # NOTE: This can avoid D2H sync (#22105), but fails to + # raise an error if is_multimodal.sum() < len(mm_embeds_flat) + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), + mm_embeds_flat.to(dtype=input_dtype)) except RuntimeError as e: num_actual_tokens = len(mm_embeds_flat) num_expected_tokens = is_multimodal.sum().item() From a9f7e843171e68c9cc044496689b72b0a11db536 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 19 Sep 2025 10:55:05 +0000 Subject: [PATCH 26/45] fix tpu recompilations Signed-off-by: NickLucche --- vllm/model_executor/models/utils.py | 5 ++++- vllm/v1/worker/tpu_model_runner.py | 22 +++++++++++++++++----- vllm/v1/worker/tpu_worker.py | 1 + 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 14335610bb9c..282fda18e667 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -16,6 +16,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader 
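# A self-contained sketch of the masked_scatter_ merge that the utils.py
# changes above settle on (the boolean-indexing path is kept only as a
# commented-out debug aid). All shapes are invented: 8 scheduled tokens,
# hidden size 4, and two multimodal items covering 2 and 3 placeholder
# tokens; torch.cat stands in for _flatten_embeddings.
import torch

hidden_size = 4
inputs_embeds = torch.zeros(8, hidden_size)            # text embeddings
mm_embeds = [torch.ones(2, hidden_size), 2 * torch.ones(3, hidden_size)]
is_multimodal = torch.tensor(
    [False, True, True, False, True, True, True, False])

mm_embeds_flat = torch.cat(mm_embeds)                  # (5, hidden_size)
# Fills the True rows in order with the flattened embeddings; unlike boolean
# index assignment, this needs no device-to-host sync on the mask.
inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1),
                              mm_embeds_flat.to(dtype=inputs_embeds.dtype))

assert torch.equal(inputs_embeds[2], torch.ones(hidden_size))
assert torch.equal(inputs_embeds[6], 2 * torch.ones(hidden_size))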
from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available, is_uva_available) @@ -419,7 +420,9 @@ def _merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. """ if len(multimodal_embeddings) == 0: - if is_multimodal.any(): + # On TPU, the .any check will force a device sync and graph break + # unless we precompile faulty cases. + if not current_platform.is_tpu() and is_multimodal.any(): num_expected_tokens = is_multimodal.sum().item() raise ValueError( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 8832b1a833d6..a96790f01753 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -939,14 +939,22 @@ def _gather_mm_embeddings( is_mm_embed[req_start_pos+start_idx:req_start_pos + end_idx] \ = True - mm_embeds_item = encoder_output[start_idx:end_idx] - mm_embeds.append(mm_embeds_item) + # Only whole mm items are processed + mm_embeds.append(encoder_output) req_start_idx += num_scheduled_tokens - is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens] \ - .to(self.device) + is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens] + if not len(mm_embeds) and is_mm_embed.any(): + # Do this safety check here, on CPU, to avoid precompiling faulty + # cases to check on device tensors. + num_expected_tokens = is_mm_embed.sum().item() + raise ValueError( + f"Attempted to assign 0 " + f"multimodal tokens to {num_expected_tokens} placeholders") + + is_mm_embed = is_mm_embed.to(self.device) return is_mm_embed, mm_embeds def _get_model_inputs( @@ -1371,11 +1379,15 @@ def _precompile_mm_encoder(self) -> None: hf_config.image_token_index placeholders_ids = placeholders_ids.to(self.device) + + mm_mask = torch.tensor([False] * num_tokens) + mm_mask[:items_size] = True + mm_mask = mm_mask.to(self.device) # Assign outputs or the graph will be cut short. a, b = self._get_model_inputs( placeholders_ids, mm_embed_inputs=( - torch.tensor(True, device=self.device), + mm_mask, [mm_embeds], ), ) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index fc72b954df9c..179cbcff39d3 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -159,6 +159,7 @@ def init_device(self): per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, f"tp{world_size}_rank{rank}") xr.initialize_cache(per_rank_path, readonly=False) + logger.debug("XLA cache initialized at %s", per_rank_path) # Init ModelRunner here, so that we have access to self.device. 
self.model_runner = \ From 29e0ad5a205fdb56a02c078152bc50d47c5284b4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 13:57:54 +0000 Subject: [PATCH 27/45] Remove sanity check for code simplicity Signed-off-by: DarkLight1337 --- vllm/model_executor/models/utils.py | 10 ---------- vllm/v1/worker/tpu_model_runner.py | 12 ++---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 282fda18e667..2d26543e8c68 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -16,7 +16,6 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import (get_cuda_view_from_cpu_tensor, is_pin_memory_available, is_uva_available) @@ -420,15 +419,6 @@ def _merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. """ if len(multimodal_embeddings) == 0: - # On TPU, the .any check will force a device sync and graph break - # unless we precompile faulty cases. - if not current_platform.is_tpu() and is_multimodal.any(): - num_expected_tokens = is_multimodal.sum().item() - - raise ValueError( - f"Attempted to assign 0 " - f"multimodal tokens to {num_expected_tokens} placeholders") - return inputs_embeds mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index a96790f01753..f7145e378bf1 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -944,17 +944,9 @@ def _gather_mm_embeddings( req_start_idx += num_scheduled_tokens - is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens] + is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens] \ + .to(self.device) - if not len(mm_embeds) and is_mm_embed.any(): - # Do this safety check here, on CPU, to avoid precompiling faulty - # cases to check on device tensors. 
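# A host-side sketch of how _gather_mm_embeddings (changed above) marks
# placeholder spans: the boolean mask is filled on the CPU per scheduled
# multimodal item and copied to the device once, so no device-side .any() or
# .sum() is needed. The request layout and sizes below are invented.
import torch

padded_total_num_scheduled_tokens = 16
is_mm_embed = torch.zeros(padded_total_num_scheduled_tokens, dtype=torch.bool)
mm_embeds: list[torch.Tensor] = []

# One scheduled item: its request starts at batch position 4 and the
# placeholder covers tokens [0, 3) of that request.
req_start_pos, start_idx, end_idx = 4, 0, 3
encoder_output = torch.randn(3, 8)   # whole item: 3 tokens, hidden size 8

is_mm_embed[req_start_pos + start_idx:req_start_pos + end_idx] = True
mm_embeds.append(encoder_output)     # only whole mm items are processed

# is_mm_embed = is_mm_embed.to(device)  # single host-to-device copy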
- num_expected_tokens = is_mm_embed.sum().item() - raise ValueError( - f"Attempted to assign 0 " - f"multimodal tokens to {num_expected_tokens} placeholders") - - is_mm_embed = is_mm_embed.to(self.device) return is_mm_embed, mm_embeds def _get_model_inputs( From f6e7e62d50bfb2833d3d68bc472aa46bf9a1bfc4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 16:19:57 +0000 Subject: [PATCH 28/45] Update interface for all MM models Signed-off-by: DarkLight1337 --- docs/contributing/model/multimodal.md | 30 ------ vllm/model_executor/models/aria.py | 25 ++--- vllm/model_executor/models/aya_vision.py | 27 ++---- vllm/model_executor/models/blip2.py | 22 ++--- vllm/model_executor/models/chameleon.py | 24 ++--- vllm/model_executor/models/cohere2_vision.py | 27 ++---- vllm/model_executor/models/deepseek_vl2.py | 25 ++--- vllm/model_executor/models/ernie45_vl.py | 31 ++++--- vllm/model_executor/models/fuyu.py | 26 ++---- vllm/model_executor/models/gemma3_mm.py | 26 ++---- vllm/model_executor/models/gemma3n_mm.py | 23 +++-- vllm/model_executor/models/glm4_1v.py | 17 ---- vllm/model_executor/models/glm4v.py | 35 ++----- vllm/model_executor/models/granite_speech.py | 31 ++++--- .../models/hyperclovax_vision.py | 35 ++----- vllm/model_executor/models/idefics3.py | 31 ++----- vllm/model_executor/models/interfaces.py | 81 +++++++++++++++-- vllm/model_executor/models/interfaces_base.py | 7 ++ vllm/model_executor/models/interns1.py | 47 ++++++---- vllm/model_executor/models/internvl.py | 46 ++++++---- vllm/model_executor/models/keye.py | 18 ---- vllm/model_executor/models/kimi_vl.py | 29 +----- vllm/model_executor/models/llama4_eagle.py | 31 ++----- vllm/model_executor/models/llava.py | 26 ++---- vllm/model_executor/models/llava_next.py | 31 ++++--- .../model_executor/models/llava_next_video.py | 23 ++--- vllm/model_executor/models/llava_onevision.py | 13 --- vllm/model_executor/models/midashenglm.py | 25 ++--- vllm/model_executor/models/minicpmv.py | 27 ++---- vllm/model_executor/models/minimax_text_01.py | 10 +- vllm/model_executor/models/minimax_vl_01.py | 25 ++--- vllm/model_executor/models/mistral3.py | 26 ++---- vllm/model_executor/models/mllama4.py | 30 ++---- vllm/model_executor/models/molmo.py | 32 ++----- .../model_executor/models/nano_nemotron_vl.py | 43 +++------ vllm/model_executor/models/nemotron_vl.py | 37 +++++--- vllm/model_executor/models/ovis.py | 21 +---- vllm/model_executor/models/ovis2_5.py | 18 +--- vllm/model_executor/models/paligemma.py | 23 ++--- vllm/model_executor/models/phi3v.py | 44 ++++++--- vllm/model_executor/models/phi4_multimodal.py | 18 +--- vllm/model_executor/models/phi4mm.py | 14 --- vllm/model_executor/models/pixtral.py | 26 ++---- .../models/qwen2_5_omni_thinker.py | 28 +++--- vllm/model_executor/models/qwen2_5_vl.py | 13 --- vllm/model_executor/models/qwen2_audio.py | 23 ++--- vllm/model_executor/models/qwen2_vl.py | 13 --- vllm/model_executor/models/qwen3_vl.py | 91 ++++++++++++------- vllm/model_executor/models/qwen_vl.py | 25 ++--- vllm/model_executor/models/skyworkr1v.py | 36 +++++--- vllm/model_executor/models/step3_vl.py | 52 +++++------ vllm/model_executor/models/tarsier.py | 25 ++--- vllm/model_executor/models/terratorch.py | 3 + vllm/model_executor/models/transformers.py | 24 ++--- vllm/model_executor/models/ultravox.py | 36 ++++---- vllm/model_executor/models/utils.py | 58 +++--------- vllm/model_executor/models/voxtral.py | 29 ++---- vllm/model_executor/models/whisper.py | 10 +- vllm/v1/spec_decode/eagle.py | 13 +-- 
vllm/v1/worker/gpu_model_runner.py | 14 +-- vllm/v1/worker/tpu_model_runner.py | 29 +++--- 61 files changed, 655 insertions(+), 1073 deletions(-) diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 87d34d207cde..760fb05b8d10 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -66,36 +66,6 @@ Further update the model as follows: !!! important The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. -- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. - - ??? code - - ```python - from .utils import merge_multimodal_embeddings - - class YourModelForImage2Seq(nn.Module): - ... - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - - # `get_input_embeddings` should already be implemented for the language - # model as one of the requirements of basic vLLM model implementation. - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.image_token_index) - - return inputs_embeds - ``` - - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. 
```python diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index a7cb6b35a4ab..48db5ca272d7 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -39,8 +39,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - is_pp_missing_parameter, maybe_prefix, - merge_multimodal_embeddings) + is_pp_missing_parameter, maybe_prefix) class AriaImagePixelInputs(TensorSchema): @@ -606,19 +605,6 @@ def get_multimodal_embeddings(self, multimodal_embeddings = self._process_image_input(image_input) return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.image_token_index) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -629,10 +615,11 @@ def forward( ) -> Union[torch.Tensor, IntermediateTensors]: if inputs_embeds is None: multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + multimodal_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model( diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 687c82ded9d0..bf0c827a7cdb 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -34,8 +34,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) class AyaVisionImagePixelInputs(TensorSchema): @@ -417,23 +416,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input, **kwargs) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.image_token_index, - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -449,8 +431,11 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model.model( diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c1e7a7d498b1..95fd363a04d3 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -28,7 +28,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP, SupportsQuant) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo @@ -632,19 +632,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self._process_image_input(image_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - _IMAGE_TOKEN_ID) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -690,8 +677,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == _IMAGE_TOKEN_ID, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 7a5623648374..2662d6a14414 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -45,7 +45,7 @@ SupportsQuant) from .utils import (flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) logger = init_logger(__name__) @@ -1003,20 +1003,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self.model.get_input_embeddings(image_tokens) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - - inputs_embeds = self.model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.model.vocabulary_mapping.image_token_id) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1033,8 +1019,12 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + image_token_id = self.model.vocabulary_mapping.image_token_id + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == image_token_id, + ) input_ids = None hidden_states = self.model(input_ids, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 179cc2af8eb3..1c6f7762fcd4 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -38,8 +38,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) class Cohere2VisionImagePixelInputs(TensorSchema): @@ -431,23 +430,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input, **kwargs) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.image_token_id, - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -463,8 +445,11 @@ def forward( # condition is for v0 compatibility. 
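# A runnable toy of the forward() pattern this patch repeats across the model
# files: build a placeholder mask from the model's image token ID and hand the
# merge to a shared get_input_embeddings. The token ID, sizes and DummyModel
# below are invented; the real mixin merges with masked_scatter_.
import torch
import torch.nn as nn

IMAGE_TOKEN_ID = 9


class DummyModel(nn.Module):

    def __init__(self, vocab_size: int = 16, hidden_size: int = 4):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_input_embeddings(self, input_ids, multimodal_embeddings=None, *,
                             is_multimodal=None):
        inputs_embeds = self.embed_tokens(input_ids)
        if multimodal_embeddings is not None:
            flat = torch.cat(multimodal_embeddings).to(inputs_embeds.dtype)
            inputs_embeds[is_multimodal] = flat
        return inputs_embeds


with torch.no_grad():
    model = DummyModel()
    input_ids = torch.tensor([1, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 2])
    vision_embeddings = [torch.zeros(2, 4)]

    inputs_embeds = model.get_input_embeddings(
        input_ids,
        vision_embeddings,
        is_multimodal=input_ids == IMAGE_TOKEN_ID,
    )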
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_id, + ) input_ids = None hidden_states = self.language_model.model( diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index d7ae8206baca..937c3f629751 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -42,8 +42,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) # The image token id may be various _IMAGE_TOKEN = "" @@ -347,7 +346,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): model_config = vllm_config.model_config tokenizer = cached_tokenizer_from_config(model_config) - self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN] + self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN] self.vision = self._init_vision_module(self.vision_config, quant_config, @@ -606,19 +605,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self._process_image_input(image_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.image_token_id) - return inputs_embeds - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, @@ -633,8 +619,11 @@ def forward(self, # condition is for v0 compatibility elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.image_token_id, + ) input_ids = None hidden_states = self.language_model(input_ids, diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 3396c67f42b7..485eaa2c6903 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -61,8 +61,7 @@ from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix from .vision import get_vit_attn_backend logger = init_logger(__name__) @@ -1467,18 +1466,24 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is None: - return inputs_embeds - - self._set_visual_token_mask(input_ids) - inputs_embeds = merge_multimodal_embeddings(input_ids, inputs_embeds, - 
multimodal_embeddings, - [self.config.im_patch_id]) - return inputs_embeds + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: + self._set_visual_token_mask(input_ids) + + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward( self, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 90af859ab92e..12e9303adbd8 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -43,8 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix # Cannot find the following 2 numbers from hf config. _IMAGE_TOKEN_ID = 71011 @@ -343,22 +342,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - _IMAGE_TOKEN_ID, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -374,8 +357,11 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == _IMAGE_TOKEN_ID, + ) input_ids = None hidden_states = self.language_model( diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index bee9fbd2c084..74057532a581 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -38,8 +38,7 @@ SupportsMultiModal, SupportsPP) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) logger = init_logger(__name__) @@ -589,22 +588,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - return inputs_embeds - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, @@ -619,8 +602,11 @@ def forward(self, elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) if (vision_embeddings is not None) and len(vision_embeddings) != 0: kwargs = self.prepare_attn_masks( input_ids, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 8d3079aee0df..f317b6c452c3 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -633,8 +633,10 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache # them here, as the model forward has only access to the input_embeds. 
if input_ids is not None: @@ -646,15 +648,16 @@ def get_input_embeddings( self.per_layer_embeddings[:per_layer_inputs.shape[0]].copy_( per_layer_inputs) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - # NOTE: this order of processing mm items is important - [self.config.image_token_id, self.config.audio_token_id]) - return inputs_embeds + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index cbf327ce02b6..922333ff786e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1551,23 +1551,6 @@ def get_multimodal_embeddings( multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if (multimodal_embeddings is not None - and len(multimodal_embeddings) != 0 - and all(embed.numel() > 0 for embed in multimodal_embeddings)): - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - [self.config.image_token_id, self.config.video_token_id], - ) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index bf33575859ae..ace9c05daf15 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -43,7 +43,7 @@ from .chatglm import ChatGLMBaseModel, ChatGLMModel from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import flatten_bn, merge_multimodal_embeddings +from .utils import flatten_bn, isin_list class GLMVImagePixelInputs(TensorSchema): @@ -607,28 +607,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self._process_image_input(image_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.transformer.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=[ - self.config.boi_token_id, - self.config.pad_token_id, - self.config.eoi_token_id, - ], - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -644,8 +622,15 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=isin_list(input_ids, [ + self.config.boi_token_id, + self.config.pad_token_id, + self.config.eoi_token_id, + ]), + ) input_ids = None hidden_states = self.transformer(input_ids, positions, diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 221023f1fb65..3596d60a495f 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -53,8 +53,7 @@ from .blip2 import Blip2QFormerModel from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import (AutoWeightsLoader, embed_multimodal, - init_vllm_registered_model, maybe_prefix) +from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix ### Audio Input @@ -729,7 +728,7 @@ def get_multimodal_embeddings( audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is None: return [] - return None + audio_features = self._process_audio_input(audio_input) return audio_features @@ -737,19 +736,21 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + # Multi-modal token ID may exceed vocab size + do_language_embed_multimodal: bool = False, ) -> torch.Tensor: - """Compute the merged LLM / audio embeddings.""" - if multimodal_embeddings is None \ - or len(multimodal_embeddings) == 0: - return self.language_model.get_input_embeddings(input_ids) + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) - inputs_embeds = embed_multimodal( + return super().get_input_embeddings( input_ids, - self.config.audio_token_index, - self.language_model.model.get_input_embeddings, - multimodal_embeddings, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, ) - return inputs_embeds def forward( self, @@ -766,7 +767,11 @@ def forward( # condition is for v0 compatibility. 
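# Models with several placeholder IDs (image and video tokens, or GLM-4V's
# boi/pad/eoi trio above) build the mask with the isin_list helper; a minimal
# torch.isin-based equivalent, with made-up token IDs:
import torch


def isin_list(elements: torch.Tensor,
              test_elements: list[int]) -> torch.Tensor:
    return torch.isin(elements,
                      torch.tensor(test_elements, device=elements.device))


IMAGE_TOKEN_ID, VIDEO_TOKEN_ID = 32000, 32001
input_ids = torch.tensor(
    [5, IMAGE_TOKEN_ID, 7, VIDEO_TOKEN_ID, VIDEO_TOKEN_ID])
is_multimodal = isin_list(input_ids, [IMAGE_TOKEN_ID, VIDEO_TOKEN_ID])
# tensor([False,  True, False,  True,  True])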
elif inputs_embeds is None: audio_embeds = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, audio_embeds) + inputs_embeds = self.get_input_embeddings( + input_ids, + audio_embeds, + is_multimodal=input_ids == self.config.audio_token_index, + ) input_ids = None model_output = self.language_model(input_ids, positions, diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 870addd0dcbc..8511bdac6646 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -46,8 +46,8 @@ from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel -from .utils import (AutoWeightsLoader, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) +from .utils import (AutoWeightsLoader, init_vllm_registered_model, isin_list, + maybe_prefix) from .vision import get_vision_encoder_info EOT = "<|endofturn|>" @@ -692,7 +692,7 @@ def get_language_model(self) -> torch.nn.Module: def get_multimodal_embeddings( self, **kwargs: Unpack[HCXVisionMultimodalInputs], - ) -> Optional[MultiModalEmbeddings]: + ) -> MultiModalEmbeddings: multimodal_embeddings = list() if kwargs.get("pixel_values_images") is not None: @@ -737,26 +737,6 @@ def get_multimodal_embeddings( multimodal_embeddings.append(_multimodal_embeddings_videos) return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - placeholder_token_id=[ - self.config.image_token_id, - self.config.video_token_id, - ], - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -772,8 +752,13 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + multimodal_embeddings, + is_multimodal=isin_list( + input_ids, + [self.config.image_token_id, self.config.video_token_id]), + ) input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 9153a0e2c1e5..77feca137ab7 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -53,8 +53,7 @@ # yapf: enable from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .llama import LlamaModel -from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix class Idefics3ImagePixelInputs(TensorSchema): @@ -540,10 +539,7 @@ def image_pixels_to_features( return image_hidden_states - def get_input_embeddings( - self, - input_ids: torch.Tensor, - ) -> torch.Tensor: + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.text_model.get_input_embeddings(input_ids) def forward( @@ -696,22 +692,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -727,8 +707,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_id, + ) input_ids = None hidden_states = self.model.text_model(input_ids, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 6be70c4b3b21..3368b8ff2fb6 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, MutableSequence -from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, Callable, ClassVar, Literal, Optional, + Protocol, Union, overload, runtime_checkable) import numpy as np import torch @@ -20,7 +20,8 @@ QuantizationConfig) from vllm.utils import supports_kw -from .interfaces_base import is_pooling_model +from .interfaces_base import VllmModel, is_pooling_model +from .utils import _merge_multimodal_embeddings if TYPE_CHECKING: from vllm.config import VllmConfig @@ -84,7 +85,7 @@ def get_multimodal_embeddings(self, """ ... - def get_language_model(self) -> torch.nn.Module: + def get_language_model(self) -> VllmModel: """ Returns the underlying language model used for text generation. 
@@ -96,17 +97,81 @@ def get_language_model(self) -> torch.nn.Module: """ ... + @overload + def get_input_embeddings(self, input_ids: Tensor) -> Tensor: + ... + + @overload + def get_input_embeddings( + self, + input_ids: Tensor, + multimodal_embeddings: MultiModalEmbeddings, + *, + is_multimodal: torch.Tensor, + do_language_embed_multimodal: bool = True, + ) -> Tensor: + ... + + def _get_text_embeddings( + self, + input_ids: Tensor, + get_input_embeddings: Callable[[Tensor], Tensor], + *, + is_multimodal: Optional[Tensor], + do_language_embed_multimodal: bool, + ) -> Tensor: + if not do_language_embed_multimodal and is_multimodal is not None: + is_text = ~is_multimodal + text_embeds = get_input_embeddings(input_ids[is_text]) + + return torch.empty( + (input_ids.shape[0], text_embeds.shape[1]), + dtype=text_embeds.dtype, + device=text_embeds.device, + ).masked_scatter_(is_text.unsqueeze_(-1), text_embeds) + + return get_input_embeddings(input_ids) + def get_input_embeddings( self, input_ids: Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> Tensor: """ - Returns the input embeddings merged from the text embeddings from - input_ids and the multimodal embeddings generated from multimodal - kwargs. + Apply token embeddings to `input_ids`. + + If `multimodal_embeddings` is passed, scatter them into + `input_ids` according to the mask `is_multimodal`. + + In case the multi-modal token IDs exceed the vocabulary size of + the language model, you can set `do_language_embed_multimodal=False` + to avoid calling the language model's `get_input_embeddings` method + on those tokens. """ - ... + inputs_embeds = self._get_text_embeddings( + input_ids, + self.get_language_model().get_input_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) + + if multimodal_embeddings is None: + return inputs_embeds + + if is_multimodal is None: + raise ValueError( + "`get_input_embeddings` now requires `is_multimodal` arg, " + "please update your model runner according to " + "https://github.com/vllm-project/vllm/pull/16229.") + + return _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) @overload diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 19a3ef1a3b80..160a6d6b2d20 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -43,6 +43,13 @@ def __init__( ) -> None: ... + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + """Apply token embeddings to `input_ids`.""" + ... 
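# A sketch of the do_language_embed_multimodal=False path defined above, used
# by models whose placeholder token IDs fall outside the language model's
# vocabulary: only the text positions are embedded and scattered into a fresh
# buffer, and the multimodal rows are left for the embedding merge. Sizes and
# token IDs are invented.
import torch

vocab_size, hidden_size = 10, 4
embed_tokens = torch.nn.Embedding(vocab_size, hidden_size)

input_ids = torch.tensor([1, 99, 99, 2])     # 99 is an out-of-vocab placeholder
is_multimodal = torch.tensor([False, True, True, False])

with torch.no_grad():
    is_text = ~is_multimodal
    text_embeds = embed_tokens(input_ids[is_text])   # never sees ID 99

    inputs_embeds = torch.empty(input_ids.shape[0], hidden_size,
                                dtype=text_embeds.dtype)
    inputs_embeds.masked_scatter_(is_text.unsqueeze(-1), text_embeds)
    # The True rows of is_multimodal are filled later by the multimodal merge.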
+ def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index b59d1b88cf5c..ba3ae350c4b4 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -37,8 +37,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, isin_list, maybe_prefix) class InternS1MultiModalProjector(nn.Module): @@ -759,24 +758,24 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - assert len(context_token_ids) >= 1 + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: self._set_visual_token_mask(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - context_token_ids, - ) - return inputs_embeds + + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward( self, @@ -794,9 +793,17 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: + context_token_ids = [ + token_id for token_id in (self.img_context_token_id, + self.video_context_token_id) + if token_id is not None + ] vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=isin_list(input_ids, context_token_ids), + ) input_ids = None forward_kwargs = { diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 6a5c565b52e8..2d1b5ba43d53 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -44,7 +44,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) + isin_list, maybe_prefix) IMG_START = '' IMG_END = '' @@ -1340,24 +1340,24 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - assert len(context_token_ids) >= 1 + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: self._set_visual_token_mask(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - context_token_ids, - ) - return inputs_embeds + + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward( self, @@ -1375,9 +1375,17 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: + context_token_ids = [ + token_id for token_id in (self.img_context_token_id, + self.video_context_token_id) + if token_id is not None + ] vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=isin_list(input_ids, context_token_ids), + ) input_ids = None forward_kwargs = { diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index afe33b4d4ad2..bdc3862d7428 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1459,24 +1459,6 @@ def get_multimodal_embeddings( multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - [ - self.config.image_token_id, - self.config.video_token_id, - ], - ) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 94a5933a6141..54c8dc993a4d 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -66,7 +66,6 @@ from vllm.model_executor.models.interfaces import (SupportsMultiModal, SupportsPP) from vllm.model_executor.models.moonvit import MoonVitPretrainedModel -from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -425,26 +424,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self._process_image_input(image_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - - # `get_input_embeddings` should already be implemented for the language - # model as one of the requirements of basic vLLM model implementation. - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None and len( - multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.media_placeholder_token_id) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -463,14 +442,12 @@ def forward( if image_input is None: inputs_embeds = None else: - inputs_embeds = self.get_input_embeddings(input_ids) image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( + inputs_embeds = self.get_input_embeddings( input_ids, - inputs_embeds, image_embeds, - placeholder_token_id=self.config. 
- media_placeholder_token_id, + is_multimodal=input_ids == + self.config.media_placeholder_token_id, ) input_ids = None diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index a203af53205c..235275c0940a 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -37,9 +37,9 @@ from vllm.model_executor.models.llama4 import (Llama4DecoderLayer, Llama4ForCausalLM) from vllm.model_executor.models.utils import extract_layer_index -from vllm.multimodal.inputs import NestedTensors -from .utils import AutoWeightsLoader, maybe_prefix, merge_multimodal_embeddings +from .interfaces import SupportsMultiModal +from .utils import AutoWeightsLoader, maybe_prefix logger = init_logger(__name__) @@ -79,10 +79,7 @@ def __init__( self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - ) -> torch.Tensor: + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) def forward( @@ -194,6 +191,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale) + def get_language_model(self) -> torch.nn.Module: + return self.model + + get_input_embeddings = SupportsMultiModal.get_input_embeddings # type: ignore + def forward( self, input_ids: torch.Tensor, @@ -220,20 +222,3 @@ def transform(inputs): skip_prefixes=(["lm_head."]), ) loader.load_weights(map(transform, weights)) - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - - return inputs_embeds diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 9591deea06ce..ff4b63031a03 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -41,8 +41,7 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vision_encoder_info @@ -678,22 +677,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -746,8 +729,11 @@ def forward( # condition is for v0 compatibility. 
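# The llama4_eagle change above reuses the mixin implementation by assigning
# it as a class attribute (get_input_embeddings =
# SupportsMultiModal.get_input_embeddings) rather than subclassing. A minimal
# illustration of that Python pattern with invented classes:
class Mixin:

    def describe(self) -> str:
        return f"hello from {type(self).__name__}"


class Model:
    # Borrow the unbound function; it binds to Model instances like a normal
    # method because plain functions are descriptors.
    describe = Mixin.describe


assert Model().describe() == "hello from Model"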
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 5e82f9799e0f..e956a95f737a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -26,8 +26,8 @@ LlavaDummyInputsBuilder, LlavaLikeConfig, LlavaMultiModalProjector, init_vision_tower_for_llava) from .siglip import SiglipVisionModel -from .utils import (AutoWeightsLoader, WeightsMapper, embed_multimodal, - flatten_bn, init_vllm_registered_model, maybe_prefix) +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix) class LlavaNextImagePixelInputs(TensorSchema): @@ -475,19 +475,21 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + # Multi-modal token ID may exceed vocab size + do_language_embed_multimodal: bool = False, ) -> torch.Tensor: + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) - if multimodal_embeddings is None \ - or len(multimodal_embeddings) == 0: - return self.language_model.get_input_embeddings(input_ids) - - inputs_embeds = embed_multimodal( + return super().get_input_embeddings( input_ids, - self.config.image_token_index, - self.language_model.model.get_input_embeddings, - multimodal_embeddings, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, ) - return inputs_embeds def forward( self, @@ -549,8 +551,11 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index cf9852de633f..61328d823071 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -31,8 +31,7 @@ from .llava import init_vision_tower_for_llava from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vision_encoder_info @@ -416,19 +415,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self._process_video_pixels(video_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.video_token_index) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -450,8 +436,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.video_token_index, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 46d54452a52d..bba08c9236a8 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -851,19 +851,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - [self.config.image_token_index, self.config.video_token_index]) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 140800dd41c7..7d287104f321 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -55,8 +55,7 @@ from vllm.transformers_utils.configs.midashenglm import DashengConfig from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix _Tuple2 = Union[int, tuple[int, 
int], Sequence[int]] @@ -745,21 +744,6 @@ def get_multimodal_embeddings(self, return [] return self._process_audio_input(audio_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.decoder.get_input_embeddings(input_ids) - if multimodal_embeddings and len(multimodal_embeddings) > 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.audio_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -772,8 +756,11 @@ def forward( inputs_embeds = None elif inputs_embeds is None: multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + multimodal_embeddings, + is_multimodal=input_ids == self.config.audio_token_id, + ) input_ids = None return self.decoder.model(input_ids, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 9b2d84e32151..2b330f8acd74 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -72,8 +72,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) -from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, flatten_bn, isin_list, maybe_prefix # For profile run _MAX_FRAMES_PER_VIDEO = 16 @@ -1145,23 +1144,6 @@ def get_multimodal_embeddings(self, return self._process_multimodal_inputs(modalities) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.llm.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - assert len(self.mm_token_ids) > 0 - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - list(self.mm_token_ids), - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -1179,8 +1161,11 @@ def forward( elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=isin_list(input_ids, list(self.mm_token_ids)), + ) input_ids = None hidden_states = self.llm.model( diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 6ce883be0a83..7e664293a43c 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -601,10 +601,7 @@ def _clear_prefill_cache(self, attn_metadata, dtype=torch.long) minimax_cache_tensors[:, slots_tensor, ...] 
= 0 - def get_input_embeddings( - self, - input_ids: torch.Tensor, - ) -> torch.Tensor: + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) def forward(self, @@ -725,10 +722,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.model.minimax_cache.get_seqlen_agnostic_capture_inputs( batch_size) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - ) -> torch.Tensor: + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) def forward(self, diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index cc7db849a28b..e369340c3746 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -29,7 +29,7 @@ from .pixtral import PixtralHFVisionModel from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) class MiniMaxVL01ImagePixelInputs(TensorSchema): @@ -219,22 +219,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - return inputs_embeds - def get_language_model(self) -> torch.nn.Module: return self.language_model @@ -406,8 +390,11 @@ def forward( inputs_embeds = None elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index d15776a39362..8a058da996d6 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -38,8 +38,7 @@ SupportsMultiModal, SupportsPP) from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vision_encoder_info @@ -524,22 +523,6 @@ def get_multimodal_embeddings(self, return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -592,8 +575,11 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 2f0e8a2a5e57..fdb2b943e65f 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -44,7 +44,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems, NestedTensors) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -57,8 +57,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llama4 import Llama4ForCausalLM -from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, - merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix class Llama4ImagePatchInputs(TensorSchema): @@ -813,24 +812,6 @@ def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings: return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None and len( - multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -846,8 +827,11 @@ def forward( # this condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None return self.language_model(input_ids, positions, intermediate_tensors, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2475fe131609..548e6765507c 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -59,7 +59,7 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) # TODO: hard-coded for now. Consider making it configurable. 
VIT_LAYERS = [-2, -9] @@ -820,10 +820,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - ) -> torch.Tensor: + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) def forward( @@ -1482,24 +1479,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - assert self.img_patch_id is not None - - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.img_patch_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.LongTensor, @@ -1516,8 +1495,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.img_patch_id, + ) input_ids = None hidden_states = self.model(input_ids, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 4f8652c00694..b71101484b77 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -35,8 +35,7 @@ from vllm.model_executor.models.radio import RadioModel from vllm.model_executor.models.utils import (flatten_bn, init_vllm_registered_model, - maybe_prefix, - merge_multimodal_embeddings) + isin_list, maybe_prefix) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -1097,8 +1096,8 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: return modalities - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: # Validate the multimodal input keyword arguments modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if modalities is None: @@ -1122,30 +1121,6 @@ def get_multimodal_embeddings( return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if (multimodal_embeddings is not None - and len(multimodal_embeddings) != 0): - context_token_ids = [ - token_id for token_id in (self.img_context_token_id, - self.video_context_token_id) - if token_id is not None - ] - assert len(context_token_ids) >= 1 - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - context_token_ids, - ) - - return inputs_embeds - def get_language_model(self) -> torch.nn.Module: return self.language_model @@ -1164,9 +1139,17 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: + context_token_ids = [ + token_id for token_id in (self.img_context_token_id, + self.video_context_token_id) + if token_id is not None + ] vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=isin_list(input_ids, context_token_ids), + ) input_ids = None hidden_states = self.language_model( diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index acda2027401d..c08204f75187 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -39,7 +39,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) IMG_START = '' IMG_END = '' @@ -577,20 +577,24 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - context_token_ids = [self.img_context_token_id] - assert len(context_token_ids) >= 1 + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: self._set_visual_token_mask(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - context_token_ids, - ) - return inputs_embeds + + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward( self, @@ -609,8 +613,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.img_context_token_id, + ) input_ids = None forward_kwargs = { diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index f1bb18716b40..cdab4c75b407 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -52,7 +52,6 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import merge_multimodal_embeddings # Cannot find the following number from hf config. 
IMAGE_TOKEN = "" @@ -513,19 +512,6 @@ def get_multimodal_embeddings(self, return image_features - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.llm.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.image_pad_token_id) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -541,8 +527,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.image_pad_token_id, + ) input_ids = None # up until here we have an inputs_embeds 100% numerical identity diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 5e4758ef8ea5..7cbe8ae3b177 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -586,17 +586,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.llm.get_input_embeddings(input_ids) - if multimodal_embeddings is not None: - tmp = torch.concat(multimodal_embeddings, dim=0) - inputs_embeds[input_ids == self.image_pad_token_id] = tmp - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -613,8 +602,11 @@ def forward( elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.image_pad_token_id, + ) input_ids = None # up until here we have a inputs_embeds 100% numerical identity diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index d6eec77ebcee..ac1bec585821 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -27,8 +27,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) from .vision import get_vision_encoder_info logger = init_logger(__name__) @@ -363,19 +362,6 @@ def get_multimodal_embeddings(self, vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.image_token_index) - return inputs_embeds - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, @@ -389,8 +375,11 @@ def forward(self, # condition is for v0 compatibility. 
elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.config.image_token_index,
+            )
             input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 4522c7043d01..0b547eccd378 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -52,9 +52,9 @@
 from .clip import CLIPVisionModel
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
                          SupportsQuant)
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+from .utils import (AutoWeightsLoader, WeightsMapper,
+                    _merge_multimodal_embeddings, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix)
 
 logger = init_logger(__name__)
 
@@ -644,14 +644,31 @@ def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        *,
+        is_multimodal: Optional[torch.Tensor] = None,
+        do_language_embed_multimodal: bool = True,
     ) -> torch.Tensor:
-        inputs_embeds = self.embed_tokens(input_ids)
-        if multimodal_embeddings is not None \
-            and len(multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                self.image_token_id)
-        return inputs_embeds
+        inputs_embeds = self._get_text_embeddings(
+            input_ids,
+            self.embed_tokens,
+            is_multimodal=is_multimodal,
+            do_language_embed_multimodal=do_language_embed_multimodal,
+        )
+
+        if multimodal_embeddings is None:
+            return inputs_embeds
+
+        if is_multimodal is None:
+            raise ValueError(
+                "`get_input_embeddings` now requires `is_multimodal` arg, "
+                "please update your model runner according to "
+                "https://github.com/vllm-project/vllm/pull/16229.")
+
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
 
     def forward(self,
                 input_ids: torch.Tensor,
@@ -667,8 +684,11 @@ def forward(self,
         # condition is for v0 compatibility
         elif inputs_embeds is None:
             vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(input_ids,
-                                                      vision_embeddings)
+            inputs_embeds = self.get_input_embeddings(
+                input_ids,
+                vision_embeddings,
+                is_multimodal=input_ids == self.image_token_id,
+            )
             input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
index 25df9e9261d9..a0d2b0cbc689 100644
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -1342,12 +1342,12 @@ def _process_image_input(
             image_attention_mask)
         return image_embeds
 
-    def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+    def get_multimodal_embeddings(self,
+                                  **kwargs: object) -> MultiModalEmbeddings:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
 
         if not modalities:
-            return None
+            return []
 
         # The result multimodal_embeddings is tuple of tensors, with each
         # tensor corresponding to a multimodal data item (image or video).
@@ -1371,18 +1371,6 @@ def get_multimodal_embeddings( return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID]) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index b3fc55dab6ec..cf0d925bf151 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1152,7 +1152,6 @@ def get_multimodal_embeddings(self, modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return [] - return None # The result multimodal_embeddings is tuple of tensors, with each # tensor corresponding to a multimodal data item (image or video). @@ -1176,19 +1175,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.model.embed_tokens(input_ids) - if multimodal_embeddings is not None and len( - multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID]) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 142d3251bc67..47a18ee87616 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -51,8 +51,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) +from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs try: @@ -434,22 +433,6 @@ def get_multimodal_embeddings(self, return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.vision_args.image_token_id, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -466,8 +449,11 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.vision_args.image_token_id, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a7e71309b607..351154287dab 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -866,24 +866,26 @@ def get_multimodal_embeddings(self, multimodal_embeddings += audio_embeddings return multimodal_embeddings + # TODO (ywang96): support overlapping modality embeddings so that + # `use_audio_in_video` will work on V1. def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - - # TODO (ywang96): support overlapping modality embeddings so that - # `use_audio_in_video` will work on V1. - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, [ - self.config.image_token_index, - self.config.video_token_index, - self.config.audio_token_index - ]) - return inputs_embeds + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def get_multimodal_embeddings_v0( self, **kwargs: object) -> Optional[NestedTensors]: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index dbf486374bcf..1724c9105266 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1164,19 +1164,6 @@ def get_multimodal_embeddings(self, multimodal_embeddings += video_embeddings return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - [self.config.image_token_id, self.config.video_token_id]) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index c797b71b5d2e..48fe2371e083 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -50,8 +50,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) +from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix # 
# === Audio Inputs === # @@ -439,19 +438,6 @@ def get_multimodal_embeddings(self, masked_audio_features = self._process_audio_input(audio_input) return masked_audio_features - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.audio_token_index) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -468,8 +454,11 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + multimodal_embeddings, + is_multimodal=input_ids == self.config.audio_token_index, + ) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7f361678ba72..4c3ddbaf0943 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1428,19 +1428,6 @@ def get_multimodal_embeddings(self, return multimodal_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - [self.config.image_token_id, self.config.video_token_id]) - return inputs_embeds - def get_input_embeddings_v0( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c224b78e2c27..5d8f4dcd2e63 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -83,7 +83,8 @@ from .qwen2_vl import Qwen2VLProcessingInfo from .qwen3 import Qwen3ForCausalLM, Qwen3Model from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - maybe_prefix, merge_multimodal_embeddings) + _merge_multimodal_embeddings, maybe_prefix, + merge_multimodal_embeddings) from .vision import get_vit_attn_backend logger = init_logger(__name__) @@ -1307,17 +1308,22 @@ def get_multimodal_embeddings( return multimodal_embeddings def _compute_deepstack_embeds( - self, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings) -> torch.Tensor: - visual_lens = [ - x.shape[0] if isinstance(x, torch.Tensor) else len(x) - for x in multimodal_embeddings - ] + self, + inputs_embeds: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings, + is_multimodal: torch.Tensor, + ) -> tuple[torch.Tensor, MultiModalEmbeddings]: + visual_lens = [len(x) for x in multimodal_embeddings] multimodal_embeddings_cat = torch.cat(multimodal_embeddings, dim=0) - multimodal_embeddings_main, multimodal_embeddings_multiscale = torch.split( # noqa:E501 - multimodal_embeddings_cat, [self.visual_dim, self.multiscale_dim], - dim=-1) + ( + multimodal_embeddings_main, + multimodal_embeddings_multiscale, + ) = torch.split( + multimodal_embeddings_cat, + [self.visual_dim, self.multiscale_dim], 
+            dim=-1,
+        )
 
         multimodal_embeddings = torch.split(multimodal_embeddings_main,
                                             visual_lens,
@@ -1329,39 +1335,60 @@ def _compute_deepstack_embeds(
             inputs_embeds.size(0),
             self.deepstack_num_level * inputs_embeds.size(1))
-        deepstack_input_embeds = merge_multimodal_embeddings(
-            input_ids,
-            deepstack_input_embeds,
-            multimodal_embeddings_multiscale,
-            placeholder_token_id=[
-                self.config.image_token_id, self.config.video_token_id
-            ],
+        deepstack_input_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=deepstack_input_embeds,
+            multimodal_embeddings=multimodal_embeddings_multiscale,
+            is_multimodal=is_multimodal,
         )
         deepstack_input_embeds = deepstack_input_embeds.view(
             inputs_embeds.shape[0], self.deepstack_num_level,
             self.visual_dim)
         deepstack_input_embeds = deepstack_input_embeds.permute(1, 0, 2)
+        return deepstack_input_embeds, multimodal_embeddings
 
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        *,
+        is_multimodal: Optional[torch.Tensor] = None,
+        do_language_embed_multimodal: bool = True,
     ) -> torch.Tensor:
-        deepstack_input_embeds = None
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
-            if self.use_deepstack:
-                deepstack_input_embeds, multimodal_embeddings = self._compute_deepstack_embeds(  # noqa:E501
-                    input_ids, inputs_embeds, multimodal_embeddings)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [self.config.image_token_id, self.config.video_token_id])
+        inputs_embeds = self._get_text_embeddings(
+            input_ids,
+            self.language_model.get_input_embeddings,
+            is_multimodal=is_multimodal,
+            do_language_embed_multimodal=do_language_embed_multimodal,
+        )
+
+        if multimodal_embeddings is None:
+            return inputs_embeds
+
+        if is_multimodal is None:
+            raise ValueError(
+                "`get_input_embeddings` now requires `is_multimodal` arg, "
+                "please update your model runner according to "
+                "https://github.com/vllm-project/vllm/pull/16229.")
 
         if self.use_deepstack:
-            if deepstack_input_embeds is None:
-                deepstack_input_embeds = torch.zeros_like(
-                    inputs_embeds).unsqueeze(0).repeat(
-                        self.deepstack_num_level, 1, 1).contiguous()
+            (
+                deepstack_input_embeds,
+                multimodal_embeddings,
+            ) = self._compute_deepstack_embeds(
+                inputs_embeds=inputs_embeds,
+                multimodal_embeddings=multimodal_embeddings,
+                is_multimodal=is_multimodal,
+            )
+        else:
+            deepstack_input_embeds = None
+
+        inputs_embeds = _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
+
+        if deepstack_input_embeds is not None:
             self._set_deepstack_input_embeds(deepstack_input_embeds)
 
         return inputs_embeds
@@ -1528,4 +1555,4 @@ def get_mm_mapping(self) -> MultiModelKeys:
             language_model="language_model",
             connector="model.visual.merger",
             tower_model="model.visual.",
-        )
\ No newline at end of file
+        )
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 90200f319464..dc11b60604a9 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -45,7 +45,7 @@
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
 from .qwen import QWenBaseModel, QWenModel
-from .utils import flatten_bn, merge_multimodal_embeddings
+from .utils
import flatten_bn class QwenImagePixelInputs(TensorSchema): @@ -756,21 +756,6 @@ def get_multimodal_embeddings(self, vision_embeddings = self._process_image_input(image_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.transformer.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.transformer.visual.image_pad_id) - - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -786,8 +771,12 @@ def forward( # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == + self.transformer.visual.image_pad_id, + ) input_ids = None hidden_states = self.transformer(input_ids, positions, diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 9857ccdcbe2d..bac98d4b9e3a 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -39,7 +39,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) IMG_START = '' IMG_END = '' @@ -843,19 +843,24 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - assert self.img_context_token_id is not None + if multimodal_embeddings is not None and len( + multimodal_embeddings) > 0: self._set_visual_token_mask(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.img_context_token_id, - ) - return inputs_embeds + + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward( self, @@ -874,8 +879,11 @@ def forward( # condition is for v0 compatibility. 
elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.img_context_token_id, + ) input_ids = None forward_kwargs = { diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 2ba5f94ea3b8..9445e3b6dc53 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargsItems, NestedTensors) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -41,8 +41,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) class Step3VLImagePixelInputs(TypedDict): @@ -1006,10 +1005,13 @@ def _process_image_input( 1 else cur_feature[0]) return merged_image_features - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: - return None + return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings @@ -1017,24 +1019,21 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + # Multi-modal token ID may exceed vocab size + do_language_embed_multimodal: bool = False, ) -> torch.Tensor: - if multimodal_embeddings is None: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - else: - is_text = input_ids != self.config.image_token_id - text_ids = input_ids[is_text] - text_embeds = self.language_model.model.get_input_embeddings( - text_ids) - inputs_embeds = torch.empty(input_ids.shape[0], - text_embeds.shape[-1], - dtype=text_embeds.dtype, - device=text_embeds.device) - inputs_embeds[is_text] = text_embeds - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.image_token_id) - return inputs_embeds + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward( self, @@ -1048,10 +1047,11 @@ def forward( inputs_embeds = None elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + 
vision_embeddings, + is_multimodal=input_ids == self.config.image_token_id, + ) input_ids = None hidden_states = self.language_model(input_ids, diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index c66867315e55..6113811dd279 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -41,7 +41,7 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - maybe_prefix, merge_multimodal_embeddings) + maybe_prefix) from .vision import VisionEncoderInfo, get_vision_encoder_info @@ -597,22 +597,6 @@ def get_multimodal_embeddings(self, return [] return self._process_image_input(image_input) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None \ - and len(multimodal_embeddings) != 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_index, - ) - return inputs_embeds - def forward( self, input_ids: torch.Tensor, @@ -625,8 +609,11 @@ def forward( inputs_embeds = None elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_index, + ) input_ids = None hidden_states = self.language_model.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index b9dfa8e9b6f5..dcd71a94f5c9 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -233,6 +233,9 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: # We do not really use any input tokens and therefore no embeddings # to be calculated. 
However, due to the mandatory token ids in diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4f51441e28ef..9e6856bc29bd 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -809,13 +809,19 @@ def forward( multimodal_embeds = self.get_multimodal_embeddings(**kwargs) if multimodal_embeds is not None: inputs_embeds = self.get_input_embeddings( - input_ids, multimodal_embeds) + input_ids, + multimodal_embeds, + is_multimodal=input_ids == self.config.image_token_id, + ) input_ids = None model_output = super().forward(input_ids, positions, intermediate_tensors, inputs_embeds) return model_output + def get_language_model(self) -> torch.nn.Module: + return self.model + def get_multimodal_embeddings(self, **kwargs): pixel_values = kwargs.pop("pixel_values", None) pixel_values = pixel_values if pixel_values is not None else kwargs.pop( @@ -866,19 +872,3 @@ def get_multimodal_embeddings(self, **kwargs): ] return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings=None, - ) -> torch.Tensor: - inputs_embeds = self.model.get_input_embeddings()(input_ids) - if (multimodal_embeddings is not None - and len(multimodal_embeddings) != 0): - mask = (input_ids == self.config.image_token_id) - mask = mask.unsqueeze(-1).expand_as(inputs_embeds) - multimodal_embeddings = torch.cat(multimodal_embeddings) - - inputs_embeds = inputs_embeds.masked_scatter( - mask, multimodal_embeddings) - return inputs_embeds diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f1f11c5fe8f0..1eeb3fc5332d 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -34,8 +34,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix) _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _MAX_ENCODER_BATCH_SIZE = 16 @@ -556,19 +555,21 @@ def get_input_embeddings( self, input_ids: torch.Tensor, multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + # Multi-modal token ID may exceed vocab size + do_language_embed_multimodal: bool = False, ) -> torch.Tensor: - # The audio token index is not included in the embedding table - # We need to remove it before embedding lookup - safe_input_ids = input_ids.clone() - safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0 - inputs_embeds = self.language_model.get_input_embeddings( - safe_input_ids) - if multimodal_embeddings is not None and len( - multimodal_embeddings) > 0: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.config.audio_token_index) - return inputs_embeds + # This is to satisfy the type checker for each overload + if multimodal_embeddings is None or is_multimodal is None: + return super().get_input_embeddings(input_ids) + + return super().get_input_embeddings( + input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) def forward(self, input_ids: torch.Tensor, @@ -602,8 +603,11 @@ def forward(self, elif inputs_embeds is None: multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds 
= self.get_input_embeddings(input_ids, - multimodal_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + multimodal_embeddings, + is_multimodal=input_ids == self.config.audio_token_index, + ) input_ids = None language_model = self.language_model diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index f207d8b1ca32..d545756b9994 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -4,7 +4,7 @@ import itertools from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import Any, Callable, Literal, Optional, Protocol, Union, overload +from typing import Any, Literal, Optional, Protocol, Union, overload import torch import torch.nn as nn @@ -391,8 +391,8 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, - is_multimodal: torch.Tensor, multimodal_embeddings: NestedTensors, + is_multimodal: torch.Tensor, ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the @@ -433,42 +433,6 @@ def _merge_multimodal_embeddings( return inputs_embeds -def embed_multimodal( - input_ids: torch.Tensor, - multimodal_token_id: int, - get_text_embeds: Callable[[torch.Tensor], torch.Tensor], - multimodal_embeds: NestedTensors, -) -> torch.Tensor: - """ - Embed token IDs and multimodal inputs and combine their embeddings. - - ``multimodal_token_id`` is used to determine whether a token ID should - be embedded using ``get_text_embeds`` or ``get_multimodal_embeds``. - - Compared to ``merge_multimodal_embeddings`, this avoids running - ``get_text_embeds`` on ``input_ids[input_ids == multimodal_token_id]`` - which causes issues when the placeholder token ID exceeds the - vocabulary size of the language model. - """ - is_multimodal = input_ids == multimodal_token_id - is_text = ~is_multimodal - - text_embeds = get_text_embeds(input_ids[is_text]) - merged_embeds = torch.empty( - (input_ids.shape[0], text_embeds.shape[1]), - dtype=text_embeds.dtype, - device=text_embeds.device, - ) - - merged_embeds[is_text] = text_embeds - - return _merge_multimodal_embeddings( - merged_embeds, - is_multimodal, - multimodal_embeds, - ) - - def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, @@ -501,13 +465,9 @@ def merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. 
""" if isinstance(placeholder_token_id, list): - placeholder_token_id = torch.tensor( - placeholder_token_id, - pin_memory=is_pin_memory_available()).to(device=input_ids.device, - non_blocking=True) return _merge_multimodal_embeddings( inputs_embeds, - torch.isin(input_ids, placeholder_token_id), + isin_list(input_ids, placeholder_token_id), multimodal_embeddings, ) @@ -518,6 +478,18 @@ def merge_multimodal_embeddings( ) +def isin_list( + elements: torch.Tensor, + test_elements_list: list[int], +) -> torch.Tensor: + test_elements = torch.tensor( + test_elements_list, + pin_memory=is_pin_memory_available(), + ).to(device=elements.device, non_blocking=True) + + return torch.isin(elements, test_elements) + + class LayerFn(Protocol): def __call__(self, prefix: str) -> torch.nn.Module: diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 16a97389cd21..22e00a649db6 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -46,10 +46,8 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer, cached_tokenizer_from_config) -from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsTranscription) -from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, - merge_multimodal_embeddings) +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription +from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix logger = init_logger(__name__) @@ -377,9 +375,14 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. elif inputs_embeds is None: + audio_encoder = self.tokenizer.instruct.audio_encoder + audio_tok_id = audio_encoder.audio_token audio_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - audio_embeddings) + inputs_embeds = self.get_input_embeddings( + input_ids, + audio_embeddings, + is_multimodal=input_ids == audio_tok_id, + ) input_ids = None hidden_states = self.language_model.model(input_ids, @@ -422,20 +425,6 @@ def get_multimodal_embeddings( return audio_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - audio_encoder = self.tokenizer.instruct.audio_encoder - audio_tok_id = audio_encoder.audio_token - - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, audio_tok_id) - return inputs_embeds - def _parse_and_validate_audio_arrays( self, **kwargs: object) -> Union[list[torch.Tensor], None]: audio_arrays = kwargs.pop("audio_arrays", None) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 41ae7b129782..7b559041071f 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -580,10 +580,7 @@ def forward( hidden_states = self.layer_norm(hidden_states) return hidden_states - def get_input_embeddings( - self, - input_ids: torch.Tensor, - ) -> torch.Tensor: + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -917,7 +914,10 @@ def get_multimodal_embeddings(self, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: 
Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, ) -> torch.Tensor: # This method just returns the decoder sequence embeddings since # Whisper does not have encoder text tokens. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5fd688753d43..0344cf9f22a6 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -18,7 +18,6 @@ from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM -from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -172,8 +171,8 @@ def propose( last_token_indices: Optional[torch.Tensor], common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, - mm_embed_inputs: Optional[tuple[torch.Tensor, - list[torch.Tensor]]] = None, + mm_embed_inputs: Optional[tuple[list[torch.Tensor], + torch.Tensor]] = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] @@ -218,14 +217,12 @@ def propose( self.hidden_states[:num_tokens] = target_hidden_states if self.supports_mm_inputs: - assert mm_embed_inputs is not None, ( - "Multi-modal embeddings should be passed from model runner") - is_mm_embed, mm_embeds = mm_embed_inputs + mm_embeds, is_mm_embed = mm_embed_inputs or (None, None) - self.inputs_embeds[:num_tokens] = _merge_multimodal_embeddings( + self.inputs_embeds[:num_tokens] = self.model.get_input_embeddings( self.input_ids[:num_tokens], - is_mm_embed, multimodal_embeddings=mm_embeds, + is_multimodal=is_mm_embed, ) input_ids = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 41f5a22b5c9c..0cfe95f00fb9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -46,7 +46,6 @@ supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) -from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, PlaceholderRange) @@ -1559,7 +1558,7 @@ def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", shift_computed_tokens: int = 0, - ) -> tuple[torch.Tensor, list[torch.Tensor]]: + ) -> tuple[list[torch.Tensor], torch.Tensor]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens is_mm_embed = self.is_mm_embed.cpu @@ -1620,7 +1619,7 @@ def _gather_mm_embeddings( is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens) - return is_mm_embed, mm_embeds + return mm_embeds, is_mm_embed def _extract_encoder_inputs( self, @@ -1905,19 +1904,16 @@ def _preprocess( and not self.model_config.is_encoder_decoder): # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) - is_mm_embed, mm_embeds = self._gather_mm_embeddings( + mm_embeds, is_mm_embed = self._gather_mm_embeddings( scheduler_output) # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. 
inputs_embeds_scheduled = self.model.get_input_embeddings( - self.input_ids.gpu[:num_scheduled_tokens]) - - inputs_embeds_scheduled = _merge_multimodal_embeddings( - inputs_embeds_scheduled, - is_mm_embed, + self.input_ids.gpu[:num_scheduled_tokens], multimodal_embeddings=mm_embeds, + is_multimodal=is_mm_embed, ) # TODO(woosuk): Avoid the copy. Optimize. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index f7145e378bf1..a8a32068b400 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -32,7 +32,6 @@ from vllm.model_executor.models.interfaces import supports_transcription from vllm.model_executor.models.interfaces_base import ( is_pooling_model, is_text_generation_model) -from vllm.model_executor.models.utils import _merge_multimodal_embeddings from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem, PlaceholderRange) @@ -883,7 +882,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", - ) -> tuple[torch.Tensor, list[torch.Tensor]]: + ) -> tuple[list[torch.Tensor], torch.Tensor]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens padded_total_num_scheduled_tokens = _get_padded_token_len( self.num_tokens_paddings, total_num_scheduled_tokens) @@ -947,27 +946,24 @@ def _gather_mm_embeddings( is_mm_embed = is_mm_embed[:padded_total_num_scheduled_tokens] \ .to(self.device) - return is_mm_embed, mm_embeds + return mm_embeds, is_mm_embed def _get_model_inputs( self, input_ids: torch.Tensor, - mm_embed_inputs: Optional[tuple[torch.Tensor, list[torch.Tensor]]], + mm_embed_inputs: Optional[tuple[list[torch.Tensor], torch.Tensor]], ): if self.supports_mm_inputs: + mm_embeds, is_mm_embed = mm_embed_inputs or (None, None) + # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - inputs_embeds = self.model.get_input_embeddings(input_ids) - - if mm_embed_inputs: - is_mm_embed, mm_embeds = mm_embed_inputs - - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds, - is_mm_embed, - multimodal_embeddings=mm_embeds, - ) + inputs_embeds = self.model.get_input_embeddings( + input_ids, + multimodal_embeddings=mm_embeds, + is_multimodal=is_mm_embed, + ) return None, inputs_embeds else: @@ -1378,10 +1374,7 @@ def _precompile_mm_encoder(self) -> None: # Assign outputs or the graph will be cut short. 
a, b = self._get_model_inputs( placeholders_ids, - mm_embed_inputs=( - mm_mask, - [mm_embeds], - ), + mm_embed_inputs=([mm_embeds], mm_mask), ) assert a is None xm.mark_step() From 74a4d5ffe0bbea3b5b84118a5ba370f77e3a676b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 19 Sep 2025 16:23:52 +0000 Subject: [PATCH 29/45] Avoid circular import Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 3368b8ff2fb6..edf752280f2e 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -21,7 +21,6 @@ from vllm.utils import supports_kw from .interfaces_base import VllmModel, is_pooling_model -from .utils import _merge_multimodal_embeddings if TYPE_CHECKING: from vllm.config import VllmConfig @@ -151,6 +150,8 @@ def get_input_embeddings( to avoid calling the language model's `get_input_embeddings` method on those tokens. """ + from .utils import _merge_multimodal_embeddings + inputs_embeds = self._get_text_embeddings( input_ids, self.get_language_model().get_input_embeddings, From 6d3a733443e7777b40b86e9f2ca07f758ff77cb1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 06:55:25 +0000 Subject: [PATCH 30/45] Fix `get_input_embeddings` Signed-off-by: DarkLight1337 --- vllm/model_executor/models/granite_speech.py | 3 ++ vllm/model_executor/models/hunyuan_v1.py | 3 ++ vllm/model_executor/models/lfm2.py | 3 ++ vllm/model_executor/models/phi4flash.py | 3 ++ vllm/model_executor/models/transformers.py | 50 +++++++++++++++++++- 5 files changed, 60 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 3596d60a495f..ff4240c14047 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -720,6 +720,9 @@ def _process_audio_input( # Split variable length features into a tuple return torch.split(masked_embeds, audio_input["audio_embed_sizes"]) + def get_language_model(self) -> torch.nn.Module: + return self.language_model + def get_multimodal_embeddings( self, **kwargs: object, diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 4110c8a1fd08..c125c86f4c94 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -990,6 +990,9 @@ def update_physical_experts_metadata( moe.n_redundant_experts = self.num_redundant_experts moe.experts.update_expert_map() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index 927f78c4e4b4..780c497cf753 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -530,6 +530,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index c4548ee168bd..b7e7ca2894ff 100644 --- 
a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -639,6 +639,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size, logits_as_input=False) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 9e6856bc29bd..e4eec4801f02 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -53,8 +53,8 @@ from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, - SupportsQuant) +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP, SupportsQuant) from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, flatten_bn, make_empty_intermediate_tensors_factory, maybe_prefix) @@ -731,6 +731,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings()(input_ids) + def compute_logits( self, hidden_states: torch.Tensor, @@ -872,3 +875,46 @@ def get_multimodal_embeddings(self, **kwargs): ] return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + *, + is_multimodal: Optional[torch.Tensor] = None, + do_language_embed_multimodal: bool = True, + ) -> torch.Tensor: + """ + Apply token embeddings to `input_ids`. + + If `multimodal_embeddings` is passed, scatter them into + `input_ids` according to the mask `is_multimodal`. + + In case the multi-modal token IDs exceed the vocabulary size of + the language model, you can set `do_language_embed_multimodal=False` + to avoid calling the language model's `get_input_embeddings` method + on those tokens. 
+ """ + from .utils import _merge_multimodal_embeddings + + inputs_embeds = self._get_text_embeddings( + input_ids, + self.model.get_input_embeddings(), + is_multimodal=is_multimodal, + do_language_embed_multimodal=do_language_embed_multimodal, + ) + + if multimodal_embeddings is None: + return inputs_embeds + + if is_multimodal is None: + raise ValueError( + "`get_input_embeddings` now requires `is_multimodal` arg, " + "please update your model runner according to " + "https://github.com/vllm-project/vllm/pull/16229.") + + return _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) From d30a4a69bc282017d40bc9c0a15afe270e208f43 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 07:00:42 +0000 Subject: [PATCH 31/45] Improve logging for unimpl methods Signed-off-by: DarkLight1337 --- vllm/config/__init__.py | 7 ++++++- vllm/model_executor/models/interfaces_base.py | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 45504e010d68..ab5aa3a1e644 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -671,9 +671,14 @@ def _task_to_convert(task: TaskOption) -> ConvertType: else: # task == "auto" pass else: + info = { + "architectures": architectures, + "is_generative_model": is_generative_model, + "is_pooling_model": is_pooling_model, + } raise AssertionError("The model should be a generative or " "pooling model when task is set to " - f"{self.task!r}.") + f"{self.task!r}. Found: {info}") self.runner = runner self.convert = convert diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 160a6d6b2d20..a7b1fc24fee6 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -63,6 +63,19 @@ def _check_vllm_model_init(model: Union[type[object], object]) -> bool: return supports_kw(model_init, "vllm_config") +def _check_vllm_model_get_input_embeddings( + model: Union[type[object], object]) -> bool: + model_get_input_embeddings = getattr(model, "get_input_embeddings", None) + if not callable(model_get_input_embeddings): + logger.warning( + "The model (%s) is missing the `get_input_embeddings` method.", + model, + ) + return False + + return True + + def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): @@ -97,7 +110,9 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: def is_vllm_model( model: Union[type[object], object], ) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]: - return _check_vllm_model_init(model) and _check_vllm_model_forward(model) + return (_check_vllm_model_init(model) + and _check_vllm_model_get_input_embeddings(model) + and _check_vllm_model_forward(model)) @runtime_checkable From 028aedf62adee9d2a1e99d4cda7c1ce1c997c8cc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 08:22:15 +0000 Subject: [PATCH 32/45] More fixes Signed-off-by: DarkLight1337 --- vllm/model_executor/models/bert.py | 6 ++++++ vllm/model_executor/models/bert_with_rope.py | 6 ++++++ vllm/model_executor/models/chatglm.py | 3 +++ vllm/model_executor/models/deepseek_eagle.py | 6 ++++++ vllm/model_executor/models/deepseek_mtp.py | 6 ++++++ vllm/model_executor/models/ernie_mtp.py | 6 ++++++ vllm/model_executor/models/glm4_moe_mtp.py | 6 ++++++ 
vllm/model_executor/models/llama_eagle.py | 6 ++++++ vllm/model_executor/models/llama_eagle3.py | 6 ++++++ vllm/model_executor/models/mimo_mtp.py | 6 ++++++ vllm/model_executor/models/modernbert.py | 9 +++++++++ vllm/model_executor/models/olmo2.py | 6 ++++++ vllm/model_executor/models/roberta.py | 3 +++ vllm/model_executor/models/solar.py | 3 +++ 14 files changed, 78 insertions(+) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index ee32587f6b1b..3517d54a56b3 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -348,6 +348,9 @@ def __init__( self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -457,6 +460,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix(prefix, "model")) self.pooler = self._build_pooler(pooler_config) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index bfc1408ddf88..4e1eba32d259 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -426,6 +426,9 @@ def __init__(self, prefix=f"{prefix}.encoder") self.pooler = BertPooler(self.config) if add_pooling_layer else None + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -673,6 +676,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loaded_params = loader.load_weights(weights) return loaded_params + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.new.get_input_embeddings(input_ids) + def forward( self, input_ids: Optional[torch.Tensor], diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 1fc2da3e4d7c..d1900e2f2799 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -434,6 +434,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 2770ddebc48a..009a679777f4 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -67,6 +67,9 @@ def __init__( self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -206,6 +209,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/deepseek_mtp.py 
b/vllm/model_executor/models/deepseek_mtp.py index 8fbf16d206a8..8793cd415c1a 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -102,6 +102,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(config.vocab_size) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -145,6 +148,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix( prefix, "model")) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py index 57c534887437..cb8f8e1d4105 100644 --- a/vllm/model_executor/models/ernie_mtp.py +++ b/vllm/model_executor/models/ernie_mtp.py @@ -118,6 +118,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(config.vocab_size) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -165,6 +168,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 322c5619c178..0fd9796251a7 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -133,6 +133,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(config.vocab_size) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -176,6 +179,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix( prefix, "model")) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index dfae3c3ea543..c196dde38ae5 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -73,6 +73,9 @@ def __init__( self.config.hidden_size, bias=False) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -144,6 +147,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.config.vocab_size, scale=logit_scale) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 7027138dfcb1..2f9eb3d0e1ed 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py 
@@ -146,6 +146,9 @@ def __init__( eps=self.config.rms_norm_eps, ) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -228,6 +231,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): requires_grad=False, ) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 09194e9f95d0..2e6f67a71339 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -118,6 +118,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -161,6 +164,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config.hidden_size, prefix=maybe_prefix(prefix, "lm_head")) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 1d5da3139de9..e4a51b369737 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -43,6 +43,9 @@ def __init__(self, config: ModernBertConfig): eps=config.layer_norm_eps, bias=config.norm_bias) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.tok_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -220,6 +223,9 @@ def __init__( eps=config.norm_eps, bias=config.norm_bias) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embeddings.get_input_embeddings(input_ids) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) @@ -333,6 +339,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ), }) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 3e4c580a1121..6a59c829875d 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -296,6 +296,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], self.config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -409,6 +412,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ba405be41687..53e698c4fa80 100644 --- 
a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -218,6 +218,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.jina_to_vllm_mapper) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.roberta.get_input_embeddings(input_ids) + def forward( self, input_ids: Optional[torch.Tensor], diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 94c862258b7a..7cd5f908487d 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -484,6 +484,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, From 38058d14ef55b8408d9e976bef21dbdedae1acae Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 08:57:11 +0000 Subject: [PATCH 33/45] Fix Signed-off-by: DarkLight1337 --- vllm/model_executor/models/bert.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 3517d54a56b3..c984845204c4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -594,6 +594,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ), }) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.bert.get_input_embeddings(input_ids) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loaded_params = loader.load_weights(weights) @@ -643,6 +646,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): Pooler.for_encode(pooler_config), }) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.bert.get_input_embeddings(input_ids) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loaded_params = loader.load_weights(weights) From a71a832257aa7acda95160327fdaecee597829cd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 09:56:12 +0000 Subject: [PATCH 34/45] Fix Signed-off-by: DarkLight1337 --- vllm/model_executor/models/step3_text.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index b8733fa5e612..905f2f7139e2 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -398,6 +398,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, From 7d8f58dcf772e6edbd60c3b71cac948144dd682a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 20 Sep 2025 12:54:46 +0000 Subject: [PATCH 35/45] Fix V0 Signed-off-by: DarkLight1337 --- vllm/model_executor/models/utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index d545756b9994..ba6f04642fad 100644 --- a/vllm/model_executor/models/utils.py +++ 
b/vllm/model_executor/models/utils.py @@ -465,16 +465,14 @@ def merge_multimodal_embeddings( This updates ``inputs_embeds`` in place. """ if isinstance(placeholder_token_id, list): - return _merge_multimodal_embeddings( - inputs_embeds, - isin_list(input_ids, placeholder_token_id), - multimodal_embeddings, - ) + is_multimodal = isin_list(input_ids, placeholder_token_id) + else: + is_multimodal = (input_ids == placeholder_token_id) return _merge_multimodal_embeddings( inputs_embeds, - (input_ids == placeholder_token_id), - multimodal_embeddings, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, ) From e33a1957300a070f25590a7ed7d78aa7dd611f45 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 21 Sep 2025 04:35:28 +0000 Subject: [PATCH 36/45] Rename `do_language_embed_multimodal -> handle_oov_mm_token` Signed-off-by: DarkLight1337 --- vllm/model_executor/models/ernie45_vl.py | 4 ++-- vllm/model_executor/models/gemma3n_mm.py | 4 ++-- vllm/model_executor/models/granite_speech.py | 4 ++-- vllm/model_executor/models/interfaces.py | 12 ++++++------ vllm/model_executor/models/interns1.py | 4 ++-- vllm/model_executor/models/internvl.py | 4 ++-- vllm/model_executor/models/llava_next.py | 4 ++-- vllm/model_executor/models/nemotron_vl.py | 4 ++-- vllm/model_executor/models/phi3v.py | 4 ++-- vllm/model_executor/models/qwen2_5_omni_thinker.py | 4 ++-- vllm/model_executor/models/qwen3_vl.py | 4 ++-- vllm/model_executor/models/skyworkr1v.py | 4 ++-- vllm/model_executor/models/step3_vl.py | 4 ++-- vllm/model_executor/models/terratorch.py | 2 +- vllm/model_executor/models/transformers.py | 6 +++--- vllm/model_executor/models/ultravox.py | 4 ++-- vllm/model_executor/models/whisper.py | 2 +- 17 files changed, 37 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 485eaa2c6903..42c619f10cd4 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1468,7 +1468,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: @@ -1482,7 +1482,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index f317b6c452c3..c1c8ebf292c0 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -635,7 +635,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache # them here, as the model forward has only access to the input_embeds. 
@@ -656,7 +656,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward(self, diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index ff4240c14047..1309ce4f0f05 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -742,7 +742,7 @@ def get_input_embeddings( *, is_multimodal: Optional[torch.Tensor] = None, # Multi-modal token ID may exceed vocab size - do_language_embed_multimodal: bool = False, + handle_oov_mm_token: bool = True, ) -> torch.Tensor: # This is to satisfy the type checker for each overload if multimodal_embeddings is None or is_multimodal is None: @@ -752,7 +752,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index edf752280f2e..b59a9d9148d8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -107,7 +107,7 @@ def get_input_embeddings( multimodal_embeddings: MultiModalEmbeddings, *, is_multimodal: torch.Tensor, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> Tensor: ... @@ -117,9 +117,9 @@ def _get_text_embeddings( get_input_embeddings: Callable[[Tensor], Tensor], *, is_multimodal: Optional[Tensor], - do_language_embed_multimodal: bool, + handle_oov_mm_token: bool, ) -> Tensor: - if not do_language_embed_multimodal and is_multimodal is not None: + if not handle_oov_mm_token and is_multimodal is not None: is_text = ~is_multimodal text_embeds = get_input_embeddings(input_ids[is_text]) @@ -137,7 +137,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> Tensor: """ Apply token embeddings to `input_ids`. @@ -146,7 +146,7 @@ def get_input_embeddings( `input_ids` according to the mask `is_multimodal`. In case the multi-modal token IDs exceed the vocabulary size of - the language model, you can set `do_language_embed_multimodal=False` + the language model, you can set `handle_oov_mm_token=False` to avoid calling the language model's `get_input_embeddings` method on those tokens. 
""" @@ -156,7 +156,7 @@ def get_input_embeddings( input_ids, self.get_language_model().get_input_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) if multimodal_embeddings is None: diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index ba3ae350c4b4..0a79d966fd2b 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -760,7 +760,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: @@ -774,7 +774,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 2d1b5ba43d53..39cca6c3ea96 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1342,7 +1342,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: @@ -1356,7 +1356,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 90ece745dd74..5551fd7fd6ee 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -478,7 +478,7 @@ def get_input_embeddings( *, is_multimodal: Optional[torch.Tensor] = None, # Multi-modal token ID may exceed vocab size - do_language_embed_multimodal: bool = False, + handle_oov_mm_token: bool = True, ) -> torch.Tensor: # This is to satisfy the type checker for each overload if multimodal_embeddings is None or is_multimodal is None: @@ -488,7 +488,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index c08204f75187..e1d169a01119 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -579,7 +579,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: @@ -593,7 +593,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff 
--git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0b547eccd378..2ad9cda3ce12 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -646,13 +646,13 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: inputs_embeds = self._get_text_embeddings( input_ids, self.embed_tokens, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) if multimodal_embeddings is None: diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 351154287dab..a7565a6cf943 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -874,7 +874,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: # This is to satisfy the type checker for each overload if multimodal_embeddings is None or is_multimodal is None: @@ -884,7 +884,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def get_multimodal_embeddings_v0( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c29da1103912..301e1dbd1f43 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1350,13 +1350,13 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: inputs_embeds = self._get_text_embeddings( input_ids, self.language_model.get_input_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) if multimodal_embeddings is None: diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index bac98d4b9e3a..68523f3dc48e 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -845,7 +845,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: if multimodal_embeddings is not None and len( multimodal_embeddings) > 0: @@ -859,7 +859,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 9445e3b6dc53..0b8f5817f652 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -1022,7 +1022,7 @@ def get_input_embeddings( *, is_multimodal: Optional[torch.Tensor] = None, # Multi-modal token ID may exceed vocab size - 
do_language_embed_multimodal: bool = False, + handle_oov_mm_token: bool = True, ) -> torch.Tensor: # This is to satisfy the type checker for each overload if multimodal_embeddings is None or is_multimodal is None: @@ -1032,7 +1032,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward( diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index dcd71a94f5c9..938b02e3e04b 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -235,7 +235,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: # We do not really use any input tokens and therefore no embeddings # to be calculated. However, due to the mandatory token ids in diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index fc2f2e5068cc..8446897f5e2d 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -946,7 +946,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: """ Apply token embeddings to `input_ids`. @@ -955,7 +955,7 @@ def get_input_embeddings( `input_ids` according to the mask `is_multimodal`. In case the multi-modal token IDs exceed the vocabulary size of - the language model, you can set `do_language_embed_multimodal=False` + the language model, you can set `handle_oov_mm_token=False` to avoid calling the language model's `get_input_embeddings` method on those tokens. 
""" @@ -965,7 +965,7 @@ def get_input_embeddings( input_ids, self.model.get_input_embeddings(), is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) if multimodal_embeddings is None: diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 1eeb3fc5332d..3afa65080f2d 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -558,7 +558,7 @@ def get_input_embeddings( *, is_multimodal: Optional[torch.Tensor] = None, # Multi-modal token ID may exceed vocab size - do_language_embed_multimodal: bool = False, + handle_oov_mm_token: bool = True, ) -> torch.Tensor: # This is to satisfy the type checker for each overload if multimodal_embeddings is None or is_multimodal is None: @@ -568,7 +568,7 @@ def get_input_embeddings( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - do_language_embed_multimodal=do_language_embed_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def forward(self, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 7b559041071f..0e6849b0141a 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -917,7 +917,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[MultiModalEmbeddings] = None, *, is_multimodal: Optional[torch.Tensor] = None, - do_language_embed_multimodal: bool = True, + handle_oov_mm_token: bool = False, ) -> torch.Tensor: # This method just returns the decoder sequence embeddings since # Whisper does not have encoder text tokens. From ead536d11060f4000a164ae34d206ad48c24913d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 21 Sep 2025 04:36:51 +0000 Subject: [PATCH 37/45] Update docstring Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b59a9d9148d8..c1ddabf0e7cc 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -148,7 +148,8 @@ def get_input_embeddings( In case the multi-modal token IDs exceed the vocabulary size of the language model, you can set `handle_oov_mm_token=False` to avoid calling the language model's `get_input_embeddings` method - on those tokens. + on those tokens. Note however that doing so increases memory usage + as an additional buffer is needed to hold the input embeddings. """ from .utils import _merge_multimodal_embeddings From 6db35c3c22c64d61e5e3393931a102140690fdaa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 21 Sep 2025 04:38:56 +0000 Subject: [PATCH 38/45] Add doc Signed-off-by: DarkLight1337 --- docs/contributing/model/multimodal.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 760fb05b8d10..61840b6c24ea 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -66,6 +66,13 @@ Further update the model as follows: !!! 
important The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. +!!! note + By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in + [PlaceholderRange][vllm.inputs.multimodal.PlaceholderRange] from input processing. + This logic can be found at [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings]. + + You may override this method if additional logic is required for your model when merging embeddings. + - Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. ```python From d13fca8b674bcd7542c805e32946dfd9d4276cc0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 22 Sep 2025 03:50:11 +0000 Subject: [PATCH 39/45] Update DotsOCR Signed-off-by: DarkLight1337 --- vllm/model_executor/models/dots_ocr.py | 44 +++++++------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 04fa5584199a..c9a946b239fb 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -28,8 +28,7 @@ Qwen2VLProcessingInfo) from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, - maybe_prefix, - merge_multimodal_embeddings) + maybe_prefix) from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict @@ -757,33 +756,17 @@ def _process_image_input( def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return [] vision_embeddings = self._process_image_input(image_input) return vision_embeddings - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - self.config.image_token_id, - ) - - return inputs_embeds - def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -791,17 +774,14 @@ def forward( ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: inputs_embeds = None - elif inputs_embeds is None and kwargs.get("pixel_values") is not None: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - inputs_embeds = None - else: - assert input_ids is not None - inputs_embeds = self.get_multimodal_embeddings( - input_ids, - image_input=image_input, - ) - input_ids = None + elif inputs_embeds is 
None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=input_ids == self.config.image_token_id, + ) + input_ids = None hidden_states = self.language_model( input_ids=input_ids, From beb9df06598e5296ff5fdb8792ae79fcc7ad7391 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 22 Sep 2025 06:26:54 +0000 Subject: [PATCH 40/45] Fix wrong condition Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c1ddabf0e7cc..241de23aa36a 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -119,7 +119,7 @@ def _get_text_embeddings( is_multimodal: Optional[Tensor], handle_oov_mm_token: bool, ) -> Tensor: - if not handle_oov_mm_token and is_multimodal is not None: + if handle_oov_mm_token and is_multimodal is not None: is_text = ~is_multimodal text_embeds = get_input_embeddings(input_ids[is_text]) From 8a6fb1ba8e251cd89ca394c2061a5c70e4dfe0ef Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 22 Sep 2025 00:09:36 -0700 Subject: [PATCH 41/45] fix qwen3-vl Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 07f4e490e9ff..e1f1e2e55e25 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1357,7 +1357,7 @@ def get_input_embeddings( handle_oov_mm_token=handle_oov_mm_token, ) - if multimodal_embeddings is None: + if not multimodal_embeddings: return inputs_embeds if is_multimodal is None: From 2eefc2d4e85ebb8c9a148268686806e6d4a927eb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 22 Sep 2025 07:13:13 +0000 Subject: [PATCH 42/45] Fix wrong condition Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 2 +- vllm/model_executor/models/phi3v.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 2 +- vllm/model_executor/models/transformers.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 241de23aa36a..19eded0e6e26 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -160,7 +160,7 @@ def get_input_embeddings( handle_oov_mm_token=handle_oov_mm_token, ) - if multimodal_embeddings is None: + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds if is_multimodal is None: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3f767f8613b2..ea34c8d92f13 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -654,7 +654,7 @@ def get_input_embeddings( handle_oov_mm_token=handle_oov_mm_token, ) - if multimodal_embeddings is None: + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds if is_multimodal is None: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index e1f1e2e55e25..f37514881f2e 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1357,7 +1357,7 @@ def get_input_embeddings( handle_oov_mm_token=handle_oov_mm_token, ) - if not 
multimodal_embeddings: + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds if is_multimodal is None: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index cce9c15087a2..e12720bea6be 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -965,7 +965,7 @@ def get_input_embeddings( handle_oov_mm_token=handle_oov_mm_token, ) - if multimodal_embeddings is None: + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds if is_multimodal is None: From aa6703304641166e79512e174339da17f48f3aa2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 24 Sep 2025 08:53:29 +0000 Subject: [PATCH 43/45] Reduce diff Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index d2af4adc3488..d4f0a65f2a16 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -161,7 +161,6 @@ def init_device(self): per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, f"tp{world_size}_rank{rank}") xr.initialize_cache(per_rank_path, readonly=False) - logger.debug("XLA cache initialized at %s", per_rank_path) # Init ModelRunner here, so that we have access to self.device. self.model_runner = \ From 9260170896378eda02bd3ffbbced59807adaae4b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 26 Sep 2025 09:06:20 +0000 Subject: [PATCH 44/45] Simplify Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index be1f8e769f2b..cb940ac51f89 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1687,8 +1687,7 @@ def _gather_mm_embeddings( if should_sync_mrope_positions: self._calc_mrope_positions(scheduler_output) - self.mrope_positions.copy_to_gpu( - scheduler_output.total_num_scheduled_tokens) + self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens) return mm_embeds, is_mm_embed From 2ac91b6b341d7acb6b2fd623a748c39fe3811ee3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 26 Sep 2025 09:30:57 +0000 Subject: [PATCH 45/45] Fix doc Signed-off-by: DarkLight1337 --- docs/contributing/model/multimodal.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 61840b6c24ea..1d72fe97b966 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -68,7 +68,7 @@ Further update the model as follows: !!! note By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in - [PlaceholderRange][vllm.inputs.multimodal.PlaceholderRange] from input processing. + [PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing. This logic can be found at [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings]. You may override this method if additional logic is required for your model when merging embeddings.
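
Illustrative sketch (not part of any patch above): the mask-based merge that this series converges on, where the model runner passes a boolean `is_multimodal` mask into `get_input_embeddings` instead of comparing token IDs against placeholder IDs. Only `torch` is assumed; the helper name `merge_by_mask` and the simplified shapes are assumptions for illustration, not the exact `_merge_multimodal_embeddings` signature.

```python
# Standalone sketch of mask-based embedding merge (assumed names/shapes).
import torch


def merge_by_mask(
    inputs_embeds: torch.Tensor,                 # [num_tokens, hidden_size]
    multimodal_embeddings: list[torch.Tensor],   # each [item_tokens, hidden_size]
    is_multimodal: torch.Tensor,                 # [num_tokens], bool
) -> torch.Tensor:
    """Overwrite the masked positions of inputs_embeds with multimodal embeddings."""
    if not multimodal_embeddings:
        return inputs_embeds

    flattened = torch.cat(multimodal_embeddings)
    num_expected = int(is_multimodal.sum())
    if flattened.shape[0] != num_expected:
        raise ValueError(
            f"Got {flattened.shape[0]} multimodal embeddings for "
            f"{num_expected} placeholder positions")

    # In-place scatter, mirroring how the model runner reuses its buffer.
    inputs_embeds[is_multimodal] = flattened.to(inputs_embeds.dtype)
    return inputs_embeds


# Example: 6 scheduled tokens, positions 2-4 hold one image item.
hidden_size = 8
inputs_embeds = torch.zeros(6, hidden_size)
is_multimodal = torch.tensor([False, False, True, True, True, False])
image_embeds = [torch.ones(3, hidden_size)]
merged = merge_by_mask(inputs_embeds, image_embeds, is_multimodal)
assert torch.equal(merged[2:5], torch.ones(3, hidden_size))
```

Passing the mask explicitly avoids recomputing `torch.isin` against placeholder token IDs on every step and also covers placeholder IDs that fall outside the language model's vocabulary (the `handle_oov_mm_token` path in the patches above).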