Skip to content

Commit 8cba549

Browse files
committed
readability/simplification updates
Signed-off-by: Nick Hill <[email protected]>
1 parent e5f9634 commit 8cba549

File tree

3 files changed

+19
-16
lines changed

3 files changed

+19
-16
lines changed

vllm/v1/core/sched/async_scheduler.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ def _update_after_schedule(
1515
scheduler_output: SchedulerOutput,
1616
) -> None:
1717
super()._update_after_schedule(scheduler_output)
18-
needs_structured_output_tokens = False
18+
pending_structured_output_tokens = False
1919
for req_id in scheduler_output.num_scheduled_tokens:
2020
request = self.requests[req_id]
21-
needs_structured_output_tokens |= (
21+
pending_structured_output_tokens |= (
2222
request.use_structured_output and request.num_output_placeholders > 0
2323
)
2424
if (
@@ -29,7 +29,9 @@ def _update_after_schedule(
2929
# TODO(woosuk): Support speculative decoding.
3030
request.num_output_placeholders += 1
3131

32-
scheduler_output.needs_structured_output_tokens = needs_structured_output_tokens
32+
scheduler_output.pending_structured_output_tokens = (
33+
pending_structured_output_tokens
34+
)
3335

3436
def _update_request_with_output(
3537
self,

vllm/v1/core/sched/output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ class SchedulerOutput:
167167

168168
# Whether the scheduled requests have all the output tokens they
169169
# need to perform grammar bitmask computation.
170-
needs_structured_output_tokens: bool = False
170+
pending_structured_output_tokens: bool = False
171171

172172
# KV Cache Connector metadata.
173173
kv_connector_metadata: KVConnectorMetadata | None = None

vllm/v1/engine/core.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,9 @@ def step_with_batch_queue(
373373
)
374374
assert isinstance(exec_future, Future)
375375

376-
if scheduler_output.needs_structured_output_tokens:
376+
if scheduler_output.pending_structured_output_tokens:
377+
# We need to defer sampling until we have processed the model output
378+
# from the prior step.
377379
deferred_scheduler_output = scheduler_output
378380
grammar_output = None
379381
else:
@@ -383,20 +385,18 @@ def step_with_batch_queue(
383385
# Block-wait for execute to return (continues running async on the GPU).
384386
model_executed = scheduler_output.total_num_scheduled_tokens > 0
385387
with self.log_error_detail(scheduler_output):
386-
model_output = exec_future.result()
388+
model_output_or_none = exec_future.result()
387389

388-
if deferred_scheduler_output:
389-
assert model_output is None
390-
else:
391-
if model_output is not None:
392-
# No sampling required (e.g. all requests finished).
393-
future = cast(Future[ModelRunnerOutput], exec_future)
394-
else:
395-
# No pending output tokens needed, sample immediately.
390+
if not deferred_scheduler_output:
391+
if model_output_or_none is None:
392+
# No pending output tokens needed here, sample immediately.
396393
sample_future = self.model_executor.sample_tokens(
397394
grammar_output, non_block=True
398395
)
399396
future = cast(Future[ModelRunnerOutput], sample_future)
397+
else:
398+
# No sampling required (e.g. all requests finished).
399+
future = cast(Future[ModelRunnerOutput], exec_future)
400400
batch_queue.appendleft((future, scheduler_output))
401401
if (
402402
model_executed
@@ -406,6 +406,8 @@ def step_with_batch_queue(
406406
# Don't block on next worker response unless the queue is full
407407
# or there are no more requests to schedule.
408408
return None, True
409+
else:
410+
assert model_output_or_none is None
409411

410412
elif not batch_queue:
411413
# Queue is empty. We should not reach here since this method should
@@ -417,13 +419,12 @@ def step_with_batch_queue(
417419
future, scheduler_output = batch_queue.pop()
418420
with self.log_error_detail(scheduler_output):
419421
model_output = future.result()
420-
assert model_output is not None
422+
421423
engine_core_outputs = self.scheduler.update_from_output(
422424
scheduler_output, model_output
423425
)
424426

425427
# TODO TBD return outputs here first?
426-
427428
if deferred_scheduler_output:
428429
# We now have the tokens needed to compute the bitmask for the
429430
# deferred request. Get the bitmask and dispatch sample request.

0 commit comments

Comments (0)