26 changes: 15 additions & 11 deletions vllm/v1/worker/gpu_model_runner.py
@@ -836,8 +836,9 @@ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
         if self.input_batch.prev_sampled_token_ids is None:
             # Normal scheduling case
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
-            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
+            if self.enable_prompt_embeds:
+                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
             return
 
         # Async scheduling case, where some decode requests from the previous
@@ -863,8 +864,9 @@ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
             # If not all requests are decodes from the last iteration,
             # We need to copy the input_ids_cpu to the GPU first.
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
-            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
+            if self.enable_prompt_embeds:
+                self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+                self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
         if num_commmon_tokens == 0:
             # No requests in common with the previous iteration
             # So input_ids_cpu will have all the input ids.
@@ -878,7 +880,8 @@ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
                 self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
                                                         0],
                 non_blocking=True)
-            self.is_token_ids.gpu[:num_commmon_tokens] = True
+            if self.enable_prompt_embeds:
+                self.is_token_ids.gpu[:num_commmon_tokens] = True
Comment on lines +883 to +884
Contributor
high

This change correctly handles is_token_ids for the fast path in async scheduling. However, a similar update is missing for the slow path (the scatter_ case for input_ids around line 900). For correctness when prompt embeddings are enabled, is_token_ids for the scattered tokens should also be set to True, as they correspond to sampled token IDs. A complete fix would handle both paths.
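A minimal sketch of what the suggested slow-path update could look like, under the assumption that the scatter path tracks the flattened destination positions in an index tensor; the name `flattened_indices_tensor` below is hypothetical and may not match the actual variable used around line 900:

    # Hypothetical sketch: `flattened_indices_tensor` stands in for whatever index
    # tensor the slow path uses to scatter prev_sampled_token_ids into input_ids.
    if self.enable_prompt_embeds:
        # The scattered positions receive sampled token IDs, not prompt
        # embeddings, so they should be marked as token IDs as well.
        self.is_token_ids.gpu[flattened_indices_tensor] = True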

             return
         # Upload the index tensors asynchronously
         # so the scatter can be non-blocking.
@@ -978,12 +981,13 @@ def _prepare_inputs(
             0,
             token_indices_tensor,
             out=self.input_ids.cpu[:total_num_scheduled_tokens])
-        is_token_ids = self.input_batch.is_token_ids.flatten()
-        torch.index_select(
-            is_token_ids,
-            0,
-            token_indices_tensor,
-            out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
+        if self.enable_prompt_embeds:
+            is_token_ids = self.input_batch.is_token_ids.flatten()
+            torch.index_select(
+                is_token_ids,
+                0,
+                token_indices_tensor,
+                out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
 
         # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
         # the InputBatch, we need to fill in the prompt embeds into the expected