vllm/v1/worker/gpu_model_runner.py (13 additions, 0 deletions)
@@ -2406,6 +2406,13 @@ def _model_forward(
**model_kwargs,
)

def _has_prefill_tokens_scheduled(self, scheduler_output: "SchedulerOutput",
                                  num_scheduled_tokens: np.ndarray,
                                  num_reqs: int) -> bool:
    """Return True if any scheduled request has not yet finished computing
    its prompt, i.e. this step still processes prefill tokens."""
    prompt_lens = self.input_batch.num_prompt_tokens[:num_reqs]
    num_computed = self.input_batch.num_computed_tokens_cpu[:num_reqs]
    return bool(np.any((num_scheduled_tokens > 0) & (num_computed < prompt_lens)))

@torch.inference_mode()
def execute_model(
self,
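
To make the mask in `_has_prefill_tokens_scheduled` concrete, here is a minimal standalone NumPy sketch of the same check. The per-request arrays below are made-up stand-ins for the runner's `input_batch` fields, not vLLM state:

```python
import numpy as np

# Hypothetical per-request state for a batch of three requests.
num_prompt_tokens = np.array([8, 4, 6])     # prompt length per request
num_computed_tokens = np.array([8, 2, 6])   # tokens already computed per request
num_scheduled_tokens = np.array([1, 2, 1])  # tokens scheduled this step

# A request still contributes prefill work if it has tokens scheduled and has
# not yet computed its full prompt. Here only request 1 (4-token prompt, 2
# computed) matches, so the step counts as having prefill tokens scheduled.
has_prefill = bool(np.any((num_scheduled_tokens > 0)
                          & (num_computed_tokens < num_prompt_tokens)))
print(has_prefill)  # True
```
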
@@ -2470,6 +2477,12 @@ def execute_model(
uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
    num_scheduled_tokens == self.input_batch.num_reqs * max_query_len
)
# Disable uniform decode on steps that still process prompt tokens.
# This makes first-step behavior consistent regardless of prompt length.
if self._has_prefill_tokens_scheduled(scheduler_output,
                                      num_scheduled_tokens,
                                      self.input_batch.num_reqs):
    uniform_decode = False
batch_descriptor = BatchDescriptor(
    num_tokens=num_input_tokens, uniform_decode=uniform_decode
)
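
For reference, a self-contained sketch of how the new guard interacts with the uniform-decode check. The variable names and values below are illustrative stand-ins for the runner state, not vLLM APIs:

```python
import numpy as np

# Stand-in runner state: three requests, each scheduled for one token.
uniform_decode_query_len = 1
num_reqs = 3
max_query_len = 1
num_scheduled_per_req = np.array([1, 1, 1])
total_scheduled = int(num_scheduled_per_req.sum())

num_prompt_tokens = np.array([8, 4, 6])
num_computed_tokens = np.array([8, 4, 3])   # request 2 is still mid-prefill

# Original uniform-decode condition: every request runs exactly one
# decode-length query this step.
uniform_decode = (max_query_len == uniform_decode_query_len) and (
    total_scheduled == num_reqs * max_query_len
)

# New guard: a step that still touches prompt tokens is never treated as
# uniform decode, so the first post-prefill step behaves the same for
# short and long prompts.
if bool(np.any((num_scheduled_per_req > 0)
               & (num_computed_tokens < num_prompt_tokens))):
    uniform_decode = False

print(uniform_decode)  # False: request 2 still has prompt tokens to process
```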