diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b14b6b1c3f52..893b8d93e1ee 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2508,6 +2508,15 @@ def _model_forward(
             **model_kwargs,
         )
 
+    def _has_prefill_tokens_scheduled(
+        self,
+        num_scheduled_tokens: np.ndarray,
+        num_reqs: int,
+    ) -> bool:
+        prompt_lens = self.input_batch.num_prompt_tokens[:num_reqs]
+        num_computed = self.input_batch.num_computed_tokens_cpu[:num_reqs]
+        return np.any((num_scheduled_tokens > 0) & (num_computed < prompt_lens))
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -2616,6 +2625,12 @@ def execute_model(
         uniform_decode = (
             max_num_scheduled_tokens == self.uniform_decode_query_len
         ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
+        # Disable uniform decode on steps that still process prompt tokens.
+        # This makes first-step behavior consistent regardless of prompt length.
+        if self._has_prefill_tokens_scheduled(
+            num_scheduled_tokens_np, self.input_batch.num_reqs
+        ):
+            uniform_decode = False
         batch_descriptor = BatchDescriptor(
             num_tokens=num_input_tokens,
             uniform_decode=uniform_decode,