Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions vllm/config/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,12 @@ class SchedulerConfig:
like full attention and sliding window attention.
"""

async_scheduling: bool = False
"""If set to True, perform async scheduling. This helps to avoid gaps in
GPU utilization, leading to better latency and throughput.
Async scheduling is currently not supported with some features such as
speculative decoding and pipeline parallelism.
async_scheduling: bool = Field(default=None)
"""If set to False, disable async scheduling. Async scheduling helps to
avoid gaps in GPU utilization, leading to better latency and throughput.
If left unset (None), it is enabled automatically unless an incompatible
option is detected. It is currently not supported with some features such as
speculative decoding and pipeline parallelism, and will be automatically
disabled in those cases.
"""

stream_interval: int = Field(default=1, ge=1)
Expand Down
32 changes: 24 additions & 8 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def __post_init__(self):
if self.speculative_config.method not in get_args(EagleModelTypes):
raise ValueError(
"Currently, async scheduling is only supported "
"with EAGLE/MTP kind of speculative decoding"
"with EAGLE/MTP kind of speculative decoding."
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
Expand All @@ -399,16 +399,27 @@ def __post_init__(self):
)
elif self.scheduler_config.async_scheduling is None:
# Enable async scheduling unless there is an incompatible option.
# NOTE: we won't reach here until async scheduling is enabled by default.
if (
self.parallel_config.pipeline_parallel_size > 1
or self.speculative_config is not None
):
if self.parallel_config.pipeline_parallel_size > 1:
logger.warning(
"Async scheduling is not yet supported with speculative decoding "
" or pipeline_parallel_size > 1 and will be disabled."
"Async scheduling is not yet supported with "
"pipeline_parallel_size > 1 and will be disabled."
)
self.scheduler_config.async_scheduling = False
elif self.speculative_config is not None:
if self.speculative_config.method not in get_args(EagleModelTypes):
logger.warning(
"Async scheduling not supported with %s-based "
"speculative decoding and will be disabled.",
self.speculative_config.method,
)
else:
logger.warning(
"Async scheduling will be disabled because some features do "
"not currently work in conjunction with speculative decoding. "
"To use async scheduling with spec decoding anyway, "
"enable it explicitly via async_scheduling=True."
)
self.scheduler_config.async_scheduling = False
elif not executor_supports_async_sched:
logger.warning(
"Async scheduling will be disabled because it is not supported "
Expand All @@ -420,6 +431,11 @@ def __post_init__(self):
else:
self.scheduler_config.async_scheduling = True

logger.info_once(
"Asynchronous scheduling is %s.",
"enabled" if self.scheduler_config.async_scheduling else "disabled",
)

from vllm.platforms import current_platform

if (
Expand Down