Skip to content

Commit b027710

Browse files
committed
[WIP] Enable async scheduling by default
Signed-off-by: Nick Hill <[email protected]>
1 parent 8964d5e commit b027710

File tree

6 files changed

+66
-16
lines changed

6 files changed

+66
-16
lines changed

tests/basic_correctness/test_basic_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def test_models(
102102
enforce_eager=enforce_eager,
103103
enable_prompt_embeds=enable_prompt_embeds,
104104
gpu_memory_utilization=0.7,
105-
async_scheduling=async_scheduling,
105+
disable_async_scheduling=not async_scheduling,
106106
distributed_executor_backend=model_executor,
107107
) as vllm_model:
108108
if enable_prompt_embeds:

tests/v1/distributed/test_async_llm_dp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def log_engine_initialized(self):
108108
prompt = "This is a test of data parallel"
109109

110110
engine_args.data_parallel_backend = data_parallel_backend
111-
engine_args.async_scheduling = async_scheduling
111+
engine_args.disable_async_scheduling = not async_scheduling
112112
engine = AsyncLLM.from_engine_args(
113113
engine_args, stat_loggers=[SimpleStatsLogger]
114114
)

tests/v1/e2e/test_async_scheduling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def test_preempt_and_async_scheduling_e2e(
6868
MODEL,
6969
max_model_len=512,
7070
enforce_eager=True,
71-
async_scheduling=async_scheduling,
71+
disable_async_scheduling=not async_scheduling,
7272
distributed_executor_backend=executor,
7373
dtype="float32", # avoid precision errors
7474
**cache_arg,

vllm/config/scheduler.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,12 @@ class SchedulerConfig:
130130
like full attention and sliding window attention.
131131
"""
132132

133-
async_scheduling: bool = False
134-
"""EXPERIMENTAL: If set to True, perform async scheduling. This may help
135-
reduce the CPU overheads, leading to better latency and throughput. However,
136-
async scheduling is currently not supported with some features such as
137-
structured outputs, speculative decoding, and pipeline parallelism.
133+
async_scheduling: bool = True
134+
"""If set to True, perform async scheduling. This helps to
135+
reduce the CPU overheads, leading to better latency and throughput.
136+
Async scheduling is currently not supported with some features such as
137+
speculative decoding and pipeline parallelism, and will be automatically
138+
disabled in those cases.
138139
"""
139140

140141
def compute_hash(self) -> str:

vllm/config/vllm.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,17 @@ def __post_init__(self):
314314
self.model_config, self.load_config
315315
)
316316

317+
if self.scheduler_config.async_scheduling:
318+
if self.parallel_config.pipeline_parallel_size > 1:
319+
raise ValueError(
320+
"Async scheduling is not yet compatible with "
321+
"pipeline_parallel_size > 1."
322+
)
323+
if self.speculative_config is not None:
324+
raise ValueError(
325+
"Async scheduling is not yet compatible with speculative decoding."
326+
)
327+
317328
from vllm.platforms import current_platform
318329

319330
if (
@@ -440,7 +451,7 @@ def __post_init__(self):
440451
self.speculative_config is not None
441452
and self.speculative_config.use_eagle()
442453
):
443-
raise NotImplementedError(
454+
raise ValueError(
444455
"Fast prefill optimization for KV sharing is not "
445456
"compatible with EAGLE as EAGLE requires correct logits "
446457
"for all tokens while fast prefill gives incorrect logits "
@@ -464,7 +475,7 @@ def __post_init__(self):
464475
)
465476
if not getattr(self.model_config.hf_config, "is_causal", True):
466477
disable_chunked_prefill_reasons.append(
467-
"Only models using causal attention supports chunked "
478+
"Only models using causal attention support chunked "
468479
"prefill and prefix caching; disabling both."
469480
)
470481
elif self.model_config.is_encoder_decoder:

vllm/engine/arg_utils.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -549,8 +549,11 @@ class EngineArgs:
549549
)
550550
"""Custom logitproc types"""
551551

552+
# DEPRECATED
552553
async_scheduling: bool = SchedulerConfig.async_scheduling
553554

555+
disable_async_scheduling: bool = not SchedulerConfig.async_scheduling
556+
554557
kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
555558

556559
def __post_init__(self):
@@ -1041,6 +1044,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
10411044
scheduler_group.add_argument(
10421045
"--async-scheduling", **scheduler_kwargs["async_scheduling"]
10431046
)
1047+
scheduler_group.add_argument(
1048+
"--disable-async-scheduling",
1049+
action=argparse.BooleanOptionalAction,
1050+
help="If True, disable the use of asynchronous scheduling.",
1051+
)
10441052

10451053
# Compilation arguments
10461054
compilation_kwargs = get_kwargs(CompilationConfig)
@@ -1491,9 +1499,22 @@ def create_engine_config(
14911499
)
14921500

14931501
if self.async_scheduling:
1502+
# Async scheduling was explicitly enabled (deprecated)
1503+
if self.disable_async_scheduling:
1504+
raise ValueError(
1505+
"Cannot set both async_scheduling and disable_async_scheduling"
1506+
)
1507+
1508+
logger.warning(
1509+
"The async_scheduling arg is deprecated now that it is enabled "
1510+
"by default. Use disable_async_scheduling to disable it."
1511+
)
1512+
1513+
# Hard-fail compatibility checks if async scheduling
1514+
# was enabled *explicitly*.
14941515
if self.pipeline_parallel_size > 1:
14951516
raise ValueError(
1496-
"Async scheduling is not supported with pipeline-parallel-size > 1."
1517+
"Async scheduling is not supported with pipeline_parallel_size > 1."
14971518
)
14981519

14991520
# Currently, async scheduling does not support speculative decoding.
@@ -1504,6 +1525,16 @@ def create_engine_config(
15041525
"async scheduling."
15051526
)
15061527

1528+
if not self.disable_async_scheduling and (
1529+
self.pipeline_parallel_size > 1 or self.speculative_config is not None
1530+
):
1531+
logger.warning(
1532+
"Async scheduling is not yet supported with "
1533+
"speculative decoding or pipeline_parallel_size > 1 "
1534+
"and will be disabled."
1535+
)
1536+
self.disable_async_scheduling = True
1537+
15071538
# Forward the deprecated CLI args to the EPLB config.
15081539
if self.num_redundant_experts is not None:
15091540
self.eplb_config.num_redundant_experts = self.num_redundant_experts
@@ -1547,14 +1578,21 @@ def create_engine_config(
15471578
_api_process_rank=self._api_process_rank,
15481579
)
15491580

1550-
if self.async_scheduling and (
1551-
parallel_config.distributed_executor_backend not in ("mp", "uni")
1552-
):
1581+
executor_supports_async_sched = (
1582+
parallel_config.distributed_executor_backend in ("mp", "uni")
1583+
)
1584+
if self.async_scheduling and not executor_supports_async_sched:
15531585
raise ValueError(
15541586
"Currently, async scheduling only supports `mp` or `uni` "
1555-
"distributed executor backend, but you choose "
1587+
"distributed executor backend, but you chose "
15561588
f"`{parallel_config.distributed_executor_backend}`."
15571589
)
1590+
if not self.disable_async_scheduling and not executor_supports_async_sched:
1591+
logger.warning(
1592+
"Currently, async scheduling only supports `mp` or `uni` "
1593+
"distributed executor backend, not `%s`, and so will be disabled.",
1594+
parallel_config.distributed_executor_backend,
)
1595+
self.disable_async_scheduling = True
15581596

15591597
speculative_config = self.create_speculative_config(
15601598
target_model_config=model_config,
@@ -1585,7 +1623,7 @@ def create_engine_config(
15851623
max_long_partial_prefills=self.max_long_partial_prefills,
15861624
long_prefill_token_threshold=self.long_prefill_token_threshold,
15871625
disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
1588-
async_scheduling=self.async_scheduling,
1626+
async_scheduling=not self.disable_async_scheduling,
15891627
)
15901628

15911629
if not model_config.is_multimodal_model and self.default_mm_loras:

0 commit comments

Comments
 (0)