From b3e5dc63114add1091f05216f1b8bd64ac741677 Mon Sep 17 00:00:00 2001
From: Andrew Sansom
Date: Thu, 25 Sep 2025 23:12:14 -0500
Subject: [PATCH 1/2] perf: Avoid copying inputs_embeds tensors to GPU unless
 prompt_embeds is enabled

Signed-off-by: Andrew Sansom
---
 vllm/v1/worker/gpu_model_runner.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index dca6feded12e..a4aba94f0ae8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -836,8 +836,9 @@ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
         if self.input_batch.prev_sampled_token_ids is None:
             # Normal scheduling case
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
-            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
+            if self.enable_prompt_embeds:
+                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
             return
 
         # Async scheduling case, where some decode requests from the previous
@@ -863,8 +864,9 @@ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
             # If not all requests are decodes from the last iteration,
             # We need to copy the input_ids_cpu to the GPU first.
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
-            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
+            if self.enable_prompt_embeds:
+                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
             if num_commmon_tokens == 0:
                 # No requests in common with the previous iteration
                 # So input_ids_cpu will have all the input ids.
@@ -878,7 +880,8 @@ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
                 self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
                                                         0],
                 non_blocking=True)
-            self.is_token_ids.gpu[:num_commmon_tokens] = True
+            if self.enable_prompt_embeds:
+                self.is_token_ids.gpu[:num_commmon_tokens] = True
             return
         # Upload the index tensors asynchronously
         # so the scatter can be non-blocking.

From af40fa6631f402ddcaf1d7bd4fca02adc7e3b662 Mon Sep 17 00:00:00 2001
From: Andrew Sansom
Date: Fri, 26 Sep 2025 00:01:52 -0500
Subject: [PATCH 2/2] fix: do not copy token ids from input batch to CPU
 tensor unless prompt embeds is enabled

Signed-off-by: Andrew Sansom
---
 vllm/v1/worker/gpu_model_runner.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a4aba94f0ae8..a1969463cbfb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -981,12 +981,13 @@ def _prepare_inputs(
             0,
             token_indices_tensor,
             out=self.input_ids.cpu[:total_num_scheduled_tokens])
-        is_token_ids = self.input_batch.is_token_ids.flatten()
-        torch.index_select(
-            is_token_ids,
-            0,
-            token_indices_tensor,
-            out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
+        if self.enable_prompt_embeds:
+            is_token_ids = self.input_batch.is_token_ids.flatten()
+            torch.index_select(
+                is_token_ids,
+                0,
+                token_indices_tensor,
+                out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
 
         # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
         # the InputBatch, we need to fill in the prompt embeds into the expected
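
Taken together, the two commits gate every inputs_embeds/is_token_ids transfer
behind self.enable_prompt_embeds: patch 1 skips the CPU-to-GPU uploads in
_prepare_input_ids, and patch 2 skips the CPU-side index_select staging in
_prepare_inputs, so none of that work happens when prompt embeds are disabled.
The sketch below distills the pattern into a self-contained, runnable form.
Note that CpuGpuBuffer and its copy_to_gpu method here are simplified
stand-ins assumed for illustration only, not vLLM's actual buffer type.

import torch


class CpuGpuBuffer:
    """A pinned-CPU staging tensor paired with a GPU tensor, with an
    explicit, asynchronous host-to-device upload step."""

    def __init__(self, size: int, dtype: torch.dtype):
        cuda = torch.cuda.is_available()
        # Pinned host memory allows non-blocking H2D copies to overlap
        # with other work on the current CUDA stream.
        self.cpu = torch.zeros(size, dtype=dtype, pin_memory=cuda)
        self.gpu = torch.zeros(size, dtype=dtype,
                               device="cuda" if cuda else "cpu")

    def copy_to_gpu(self, n: int) -> None:
        # Upload only the first n elements.
        self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)


def prepare_input_ids(input_ids: CpuGpuBuffer,
                      inputs_embeds: CpuGpuBuffer,
                      is_token_ids: CpuGpuBuffer,
                      num_tokens: int,
                      enable_prompt_embeds: bool) -> None:
    # Token ids are always consumed by the model, so always upload them.
    input_ids.copy_to_gpu(num_tokens)
    # The embeds-related buffers are only read when prompt embeds are
    # enabled; skipping these copies otherwise is the point of the patch.
    if enable_prompt_embeds:
        inputs_embeds.copy_to_gpu(num_tokens)
        is_token_ids.copy_to_gpu(num_tokens)


if __name__ == "__main__":
    ids = CpuGpuBuffer(8, torch.int64)
    embeds = CpuGpuBuffer(8, torch.float32)
    token_mask = CpuGpuBuffer(8, torch.bool)
    # With prompt embeds disabled, only the token-id upload is issued.
    prepare_input_ids(ids, embeds, token_mask, num_tokens=4,
                      enable_prompt_embeds=False)

With enable_prompt_embeds=False, only the token-id upload is enqueued, which
is the savings the first commit targets; the second commit applies the same
gating one step earlier, to the CPU-side staging copy that feeds these
buffers, so the wasted work is eliminated on both sides of the transfer.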