Skip to content

Commit b027710

Browse files
committed
[WIP] Enable async scheduling by default
Signed-off-by: Nick Hill <[email protected]>
1 parent 8964d5e commit b027710

File tree

6 files changed

+66
-16
lines changed

6 files changed

+66
-16
lines changed

tests/basic_correctness/test_basic_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def test_models(
102102
enforce_eager=enforce_eager,
103103
enable_prompt_embeds=enable_prompt_embeds,
104104
gpu_memory_utilization=0.7,
105-
async_scheduling=async_scheduling,
105+
disable_async_scheduling=not async_scheduling,
106106
distributed_executor_backend=model_executor,
107107
) as vllm_model:
108108
if enable_prompt_embeds:

tests/v1/distributed/test_async_llm_dp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def log_engine_initialized(self):
108108
prompt = "This is a test of data parallel"
109109

110110
engine_args.data_parallel_backend = data_parallel_backend
111-
engine_args.async_scheduling = async_scheduling
111+
engine_args.disable_async_scheduling = not async_scheduling
112112
engine = AsyncLLM.from_engine_args(
113113
engine_args, stat_loggers=[SimpleStatsLogger]
114114
)

tests/v1/e2e/test_async_scheduling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def test_preempt_and_async_scheduling_e2e(
6868
MODEL,
6969
max_model_len=512,
7070
enforce_eager=True,
71-
async_scheduling=async_scheduling,
71+
disable_async_scheduling=not async_scheduling,
7272
distributed_executor_backend=executor,
7373
dtype="float32", # avoid precision errors
7474
**cache_arg,

vllm/config/scheduler.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,12 @@ class SchedulerConfig:
130130
like full attention and sliding window attention.
131131
"""
132132

133-
async_scheduling: bool = False
134-
"""EXPERIMENTAL: If set to True, perform async scheduling. This may help
135-
reduce the CPU overheads, leading to better latency and throughput. However,
136-
async scheduling is currently not supported with some features such as
137-
structured outputs, speculative decoding, and pipeline parallelism.
133+
async_scheduling: bool = True
134+
"""If set to True, perform async scheduling. This helps to
135+
reduce the CPU overheads, leading to better latency and throughput.
136+
Async scheduling is currently not supported with some features such as
137+
speculative decoding and pipeline parallelism, and will be automatically
138+
disabled in those cases.
138139
"""
139140

140141
def compute_hash(self) -> str:

vllm/config/vllm.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,17 @@ def __post_init__(self):
314314
self.model_config, self.load_config
315315
)
316316

317+
if self.scheduler_config.async_scheduling:
318+
if self.parallel_config.pipeline_parallel_size > 1:
319+
raise ValueError(
320+
"Async scheduling is not yet compatible with "
321+
"pipeline_parallel_size > 1."
322+
)
323+
if self.speculative_config is not None:
324+
raise ValueError(
325+
"Async scheduling is not yet compatible with speculative decoding."
326+
)
327+
317328
from vllm.platforms import current_platform
318329

319330
if (
@@ -440,7 +451,7 @@ def __post_init__(self):
440451
self.speculative_config is not None
441452
and self.speculative_config.use_eagle()
442453
):
443-
raise NotImplementedError(
454+
raise ValueError(
444455
"Fast prefill optimization for KV sharing is not "
445456
"compatible with EAGLE as EAGLE requires correct logits "
446457
"for all tokens while fast prefill gives incorrect logits "
@@ -464,7 +475,7 @@ def __post_init__(self):
464475
)
465476
if not getattr(self.model_config.hf_config, "is_causal", True):
466477
disable_chunked_prefill_reasons.append(
467-
"Only models using causal attention supports chunked "
478+
"Only models using causal attention support chunked "
468479
"prefill and prefix caching; disabling both."
469480
)
470481
elif self.model_config.is_encoder_decoder:

vllm/engine/arg_utils.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -549,8 +549,11 @@ class EngineArgs:
549549
)
550550
"""Custom logitproc types"""
551551

552+
# DEPRECATED
552553
async_scheduling: bool = SchedulerConfig.async_scheduling
553554

555+
disable_async_scheduling: bool = not SchedulerConfig.async_scheduling
556+
554557
kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
555558

556559
def __post_init__(self):
@@ -1041,6 +1044,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
10411044
scheduler_group.add_argument(
10421045
"--async-scheduling", **scheduler_kwargs["async_scheduling"]
10431046
)
1047+
scheduler_group.add_argument(
1048+
"--disable-async-scheduling",
1049+
action=argparse.BooleanOptionalAction,
1050+
help="If True, disable the use of asynchronous scheduling.",
1051+
)
10441052

10451053
# Compilation arguments
10461054
compilation_kwargs = get_kwargs(CompilationConfig)
@@ -1491,9 +1499,22 @@ def create_engine_config(
14911499
)
14921500

14931501
if self.async_scheduling:
1502+
# Async scheduling was explicitly enabled (deprecated)
1503+
if self.disable_async_scheduling:
1504+
raise ValueError(
1505+
"Cannot set both async_scheduling and disable_async_scheduling"
1506+
)
1507+
1508+
logger.warning(
1509+
"The async_scheduling arg is deprecated now that it is enabled "
1510+
"by default. Use disable_async_scheduling to disable it."
1511+
)
1512+
1513+
# Hard-fail compatibility checks if async scheduling
1514+
# was enabled *explicitly*.
14941515
if self.pipeline_parallel_size > 1:
14951516
raise ValueError(
1496-
"Async scheduling is not supported with pipeline-parallel-size > 1."
1517+
"Async scheduling is not supported with pipeline_parallel_size > 1."
14971518
)
14981519

14991520
# Currently, async scheduling does not support speculative decoding.
@@ -1504,6 +1525,16 @@ def create_engine_config(
15041525
"async scheduling."
15051526
)
15061527

1528+
if not self.disable_async_scheduling and (
1529+
self.pipeline_parallel_size > 1 or self.speculative_config is not None
1530+
):
1531+
logger.warning(
1532+
"Async scheduling is not yet supported with "
1533+
"speculative decoding or pipeline_parallel_size > 1 "
1534+
"and will be disabled."
1535+
)
1536+
self.disable_async_scheduling = True
1537+
15071538
# Forward the deprecated CLI args to the EPLB config.
15081539
if self.num_redundant_experts is not None:
15091540
self.eplb_config.num_redundant_experts = self.num_redundant_experts
@@ -1547,14 +1578,21 @@ def create_engine_config(
15471578
_api_process_rank=self._api_process_rank,
15481579
)
15491580

1550-
if self.async_scheduling and (
1551-
parallel_config.distributed_executor_backend not in ("mp", "uni")
1552-
):
1581+
executor_supports_async_sched = (
1582+
parallel_config.distributed_executor_backend in ("mp", "uni")
1583+
)
1584+
if self.async_scheduling and not executor_supports_async_sched:
15531585
raise ValueError(
15541586
"Currently, async scheduling only supports `mp` or `uni` "
1555-
"distributed executor backend, but you choose "
1587+
"distributed executor backend, but you chose "
15561588
f"`{parallel_config.distributed_executor_backend}`."
15571589
)
1590+
if not self.disable_async_scheduling and not executor_supports_async_sched:
1591+
logger.warning(
1592+
"Currently, async scheduling only supports `mp` or `uni` "
1593+
"distributed executor backend, not `%s`, and so will be disabled.",
1594+
parallel_config.distributed_executor_backend,
)
1595+
self.disable_async_scheduling = True
15581596

15591597
speculative_config = self.create_speculative_config(
15601598
target_model_config=model_config,
@@ -1585,7 +1623,7 @@ def create_engine_config(
15851623
max_long_partial_prefills=self.max_long_partial_prefills,
15861624
long_prefill_token_threshold=self.long_prefill_token_threshold,
15871625
disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
1588-
async_scheduling=self.async_scheduling,
1626+
async_scheduling=not self.disable_async_scheduling,
15891627
)
15901628

15911629
if not model_config.is_multimodal_model and self.default_mm_loras:

0 commit comments

Comments
 (0)