Skip to content

Commit ad28c6e

Browse files
southfreebird authored and
ilmarkov committed
[Bugfix] Fix error with penalties when speculative decoding and structural output are enabled (vllm-project#26586)
Signed-off-by: southfreebird <[email protected]>
1 parent 096ac86 commit ad28c6e

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

vllm/v1/worker/gpu_model_runner.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -780,7 +780,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
780780

781781
# Add spec_token_ids to token_ids_cpu.
782782
spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
783-
req_id, ()
783+
req_id, []
784784
)
785785
if spec_token_ids:
786786
num_spec_tokens = len(spec_token_ids)
@@ -791,7 +791,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
791791
] = spec_token_ids
792792
# NOTE(woosuk): `num_tokens` here may include spec tokens.
793793
self.input_batch.num_tokens[req_index] += num_spec_tokens
794-
self.input_batch.spec_token_ids[req_index] = spec_token_ids
794+
795+
# When speculative decoding is used with structured output,
796+
# the scheduler can drop draft tokens that do not
797+
# conform to the schema. This can result in
798+
# scheduler_output.scheduled_spec_decode_tokens being empty,
799+
# even when speculative decoding is enabled.
800+
self.input_batch.spec_token_ids[req_index] = spec_token_ids
795801

796802
# Add the new or resumed requests to the persistent batch.
797803
# The smaller empty indices are filled first.

0 commit comments

Comments
 (0)