vllm/v1/worker/gpu_model_runner.py (15 additions, 0 deletions)
@@ -2508,6 +2508,15 @@ def _model_forward(
**model_kwargs,
)

def _has_prefill_tokens_scheduled(
self,
num_scheduled_tokens: np.ndarray,
num_reqs: int,
) -> bool:
prompt_lens = self.input_batch.num_prompt_tokens[:num_reqs]
num_computed = self.input_batch.num_computed_tokens_cpu[:num_reqs]
return np.any((num_scheduled_tokens > 0) & (num_computed < prompt_lens))

@torch.inference_mode()
def execute_model(
self,
@@ -2616,6 +2625,12 @@ def execute_model(
uniform_decode = (
max_num_scheduled_tokens == self.uniform_decode_query_len
) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
# Disable uniform decode on steps that still process prompt tokens.
# This makes first-step behavior consistent regardless of prompt length.
if self._has_prefill_tokens_scheduled(
num_scheduled_tokens_np, self.input_batch.num_reqs
):
uniform_decode = False
batch_descriptor = BatchDescriptor(
num_tokens=num_input_tokens,
uniform_decode=uniform_decode,
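For reference, below is a minimal standalone sketch (plain NumPy, outside vLLM's runner classes) of the check the new `_has_prefill_tokens_scheduled` helper performs and how it gates `uniform_decode`. The function and array names here are illustrative, not vLLM's internal attributes; treat it as a sketch of the technique, not the actual implementation.

```python
import numpy as np


def has_prefill_tokens_scheduled(
    num_scheduled_tokens: np.ndarray,  # tokens scheduled per request this step
    num_computed_tokens: np.ndarray,   # tokens already computed per request
    num_prompt_tokens: np.ndarray,     # prompt length per request
) -> bool:
    # A request is still in prefill if it has tokens scheduled this step
    # and has not yet computed its full prompt.
    return bool(
        np.any(
            (num_scheduled_tokens > 0)
            & (num_computed_tokens < num_prompt_tokens)
        )
    )


# Example: three requests; the third is still mid-prompt, so the step is
# treated as non-uniform-decode even though the other two are decoding.
scheduled = np.array([1, 1, 16])
computed = np.array([8, 12, 4])
prompts = np.array([8, 12, 20])

uniform_decode = True  # hypothetical value derived from the scheduling shape
if has_prefill_tokens_scheduled(scheduled, computed, prompts):
    uniform_decode = False

assert uniform_decode is False
```

The override only ever flips `uniform_decode` from True to False, so steps that contain any prompt tokens are classified the same way regardless of how long the prompt is, which is the consistency the inline comment in the diff describes.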