[Bugfix] Fix error with penalties when speculative decoding and structured output are enabled (vllm-project#26586)

southfreebird · ilmarkov · commit ad28c6e0f688 · 2025-11-07T09:29:56.000Z
Signed-off-by: southfreebird <yvorott@gmail.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -780,7 +780,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
 
             # Add spec_token_ids to token_ids_cpu.
             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                req_id, ()
+                req_id, []
             )
             if spec_token_ids:
                 num_spec_tokens = len(spec_token_ids)
@@ -791,7 +791,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 ] = spec_token_ids
                 # NOTE(woosuk): `num_tokens` here may include spec tokens.
                 self.input_batch.num_tokens[req_index] += num_spec_tokens
-                self.input_batch.spec_token_ids[req_index] = spec_token_ids
+
+            # When speculative decoding is used with structured output,
+            # the scheduler can drop draft tokens that do not conform to
+            # the schema. spec_token_ids can therefore be empty for this
+            # request even when speculative decoding is enabled, so the
+            # assignment below must run unconditionally.
+            self.input_batch.spec_token_ids[req_index] = spec_token_ids
 
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.