@@ -61,6 +61,7 @@ def __init__(
         self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
             cache_config.cache_dtype]

+        self.is_multimodal_model = model_config.is_multimodal_model
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.max_model_len = model_config.max_model_len
@@ -103,6 +104,11 @@ def __init__(
         # The batch sizes in the config are in descending order.
         self.cudagraph_batch_sizes = list(
             reversed(self.vllm_config.compilation_config.capture_sizes))
+
+        # Persistent buffers for CUDA graphs.
+        self.input_ids = torch.zeros(self.max_num_tokens,
+                                     dtype=torch.int32,
+                                     device=self.device)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
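A captured CUDA graph replays fixed kernel launch parameters, including tensor addresses, so every replay must read its inputs from the same memory. That is why input_ids becomes a preallocated buffer here, mirroring the existing positions buffer. A minimal standalone sketch of the pattern, with illustrative names and sizes that are not from this file:

import torch

# Hypothetical size, standing in for max_num_tokens in the runner.
MAX_NUM_TOKENS = 8192
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Persistent buffers: allocated once at fixed addresses and reused on
# every step, which is what CUDA graph replay requires.
input_ids = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int32, device=device)
positions = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int64, device=device)

def stage_inputs(new_ids: torch.Tensor, new_pos: torch.Tensor) -> int:
    """Copy this step's inputs into the persistent buffers in place."""
    n = new_ids.shape[0]
    input_ids[:n].copy_(new_ids, non_blocking=True)
    positions[:n].copy_(new_pos, non_blocking=True)
    return n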
@@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         seq_start_loc_np[0] = 0
         np.cumsum(seq_lens, out=seq_start_loc_np[1:])

-        input_ids = input_ids.to(self.device, non_blocking=True)
+        self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
+                                                          non_blocking=True)
         self.positions[:total_num_scheduled_tokens].copy_(positions,
                                                           non_blocking=True)
         query_start_loc = query_start_loc.to(self.device, non_blocking=True)
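The old .to(self.device, ...) call allocated a fresh device tensor every step; copy_ instead writes into a slice of the persistent buffer, keeping its address stable across steps. One detail worth a sketch: a non_blocking host-to-device copy only overlaps with other GPU work when the CPU source is in pinned (page-locked) memory. This example is illustrative and does not show the runner's own staging tensors:

import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    buf = torch.zeros(1024, dtype=torch.int32, device=device)

    # Pin the CPU source so the non_blocking=True copy can actually
    # run asynchronously with respect to the GPU.
    src = torch.arange(256, dtype=torch.int32).pin_memory()
    buf[:src.numel()].copy_(src, non_blocking=True)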
@@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # token from the partial request.
         # TODO: Support prompt logprobs.
         logits_indices = query_start_loc[1:] - 1
-        return input_ids, attn_metadata, logits_indices
+        return attn_metadata, logits_indices

     def _prepare_sampling(
         self,
@@ -427,13 +434,15 @@ def execute_model(
     ) -> ModelRunnerOutput:
         self._update_states(scheduler_output)

-        # Run the encoder.
-        self._execute_encoder(scheduler_output)
-        encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        if self.is_multimodal_model:
+            # Run the multimodal encoder if any.
+            self._execute_encoder(scheduler_output)
+            encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        else:
+            encoder_outputs = []

         # Prepare the decoder inputs.
-        input_ids, attn_metadata, logits_indices = self._prepare_inputs(
-            scheduler_output)
+        attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
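The guard compares against cudagraph_batch_sizes[-1], the largest captured size once the reversed() in __init__ puts the list in ascending order. The padded-size selection itself falls between these hunks; a plausible sketch of that bucket lookup, assuming the runner rounds up to the smallest captured size that fits (pick_padded_size is a hypothetical helper, not from this diff):

import bisect
from typing import List

def pick_padded_size(captured_sizes: List[int], num_tokens: int) -> int:
    """Smallest captured batch size that fits num_tokens.

    captured_sizes must be sorted ascending, matching
    list(reversed(compilation_config.capture_sizes)) above. Assumes
    num_tokens <= captured_sizes[-1], as checked in execute_model.
    """
    idx = bisect.bisect_left(captured_sizes, num_tokens)
    return captured_sizes[idx]

assert pick_padded_size([1, 2, 4, 8, 16], 5) == 8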
@@ -444,29 +453,39 @@ def execute_model(
         else:
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
-
         attn_metadata.num_input_tokens = num_input_tokens

-        # Get the inputs embeds.
-        if encoder_outputs:
-            inputs_embeds = self.model.get_input_embeddings(
-                input_ids, encoder_outputs)
+        if self.is_multimodal_model:
+            # NOTE(woosuk): To unify token ids and soft tokens (vision
+            # embeddings), we always use embeddings (rather than token ids)
+            # as input to the multimodal model, even when the input is text.
+            input_ids = self.input_ids[:num_scheduled_tokens]
+            if encoder_outputs:
+                inputs_embeds = self.model.get_input_embeddings(
+                    input_ids, encoder_outputs)
+            else:
+                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            # TODO(woosuk): Avoid the copy. Optimize.
+            self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            input_ids = None
         else:
-            inputs_embeds = self.model.get_input_embeddings(input_ids)
-        # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
-        # always use embeddings (rather than token ids) as input to the model.
-        # TODO(woosuk): Avoid the copy. Optimize.
-        self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            # For text-only models, we use token ids as input.
+            # While it is possible to use embeddings as input just like the
+            # multimodal models, it is not desirable for performance since
+            # then the embedding layer is not included in the CUDA graph.
+            input_ids = self.input_ids[:num_input_tokens]
+            inputs_embeds = None

         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_input_tokens],
                 kv_caches=self.kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_input_tokens],
+                inputs_embeds=inputs_embeds,
             )
         hidden_states = hidden_states[:num_scheduled_tokens]
         hidden_states = hidden_states[logits_indices]
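The branch encodes the tradeoff spelled out in the comments: multimodal models always go through inputs_embeds so that text tokens and vision soft tokens share one path, while text-only models pass input_ids so the embedding lookup runs inside the graphed forward. A toy sketch of a model honoring that either/or contract (TinyDecoder is hypothetical, not vLLM's model interface):

import torch
import torch.nn as nn

class TinyDecoder(nn.Module):
    """Toy model with the same dual-input contract as the runner's model:
    exactly one of input_ids / inputs_embeds is provided."""

    def __init__(self, vocab: int = 128, dim: int = 16):
        super().__init__()
        self.embed = nn.Embedding(vocab, dim)
        self.proj = nn.Linear(dim, dim)

    def forward(self, input_ids=None, inputs_embeds=None):
        assert (input_ids is None) != (inputs_embeds is None)
        if inputs_embeds is None:
            # Text-only path: the embedding lookup happens inside forward,
            # so it is captured in the CUDA graph with the rest of the model.
            inputs_embeds = self.embed(input_ids)
        return self.proj(inputs_embeds)

model = TinyDecoder()
out_text = model(input_ids=torch.tensor([1, 2, 3]))
out_mm = model(inputs_embeds=torch.randn(3, 16))
assert out_text.shape == out_mm.shape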
@@ -534,13 +553,20 @@ def _dummy_run(
         num_tokens: int,
         kv_caches: List[torch.Tensor],
     ) -> torch.Tensor:
+        if self.is_multimodal_model:
+            input_ids = None
+            inputs_embeds = self.inputs_embeds[:num_tokens]
+        else:
+            input_ids = self.input_ids[:num_tokens]
+            inputs_embeds = None
         with set_forward_context(None, self.vllm_config):
             hidden_states = model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_tokens],
                 kv_caches=kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_tokens])
+                inputs_embeds=inputs_embeds,
+            )
         return hidden_states

     def profile_run(self) -> None:
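_dummy_run now mirrors execute_model's branching, so that a graph captured from a dummy run sees the exact input signature later replays will use. For context, a generic capture-and-replay sketch built on the same persistent-buffer idea (standalone PyTorch, not the runner's capture code):

import torch

# Guarded so the sketch stays importable without a GPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    buf = torch.zeros(16, device=device)

    def forward(x: torch.Tensor) -> torch.Tensor:
        return x * 2.0 + 1.0

    # Warm up on a side stream, then capture one dummy run.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        forward(buf)
    torch.cuda.current_stream().wait_stream(s)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        out = forward(buf)

    # Replay: refresh the persistent input in place, then launch the graph.
    buf.copy_(torch.arange(16, dtype=torch.float32, device=device))
    graph.replay()
    assert torch.allclose(out, buf * 2.0 + 1.0)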