From b82f2127a3a0e617af9381dc80329f0b1cb10029 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 14 Nov 2025 16:17:39 +0000
Subject: [PATCH 1/3] [Chore] Rename `SchedulerConfig.chunked_prefill_enabled` -> `enable_chunked_prefill`

Signed-off-by: DarkLight1337
---
 tests/v1/core/test_scheduler.py     |  1 -
 tests/v1/e2e/test_spec_decode.py    | 10 ++++------
 tests/v1/engine/test_engine_core.py |  2 +-
 vllm/config/scheduler.py            | 12 ++----------
 vllm/config/vllm.py                 |  6 +++---
 vllm/platforms/cpu.py               |  2 +-
 vllm/v1/core/sched/scheduler.py     |  2 +-
 vllm/v1/engine/core.py              |  2 +-
 vllm/v1/worker/gpu_model_runner.py  |  4 ++--
 9 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 287e735b5491..04e738293cd7 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
 ) -> None:
     """Validate chunked prefill settings in the scheduler config for
     encoder-decoder models."""
-    assert scheduler_config.chunked_prefill_enabled is expect_enabled
     assert scheduler_config.enable_chunked_prefill is expect_enabled
     if is_encoder_decoder:
         # Encoder-decoder models should automatically disable chunked multimodal
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 4a6b84ae4817..6cffaafb127e 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -272,7 +272,7 @@ def test_speculators_model_integration(
 
 
 @pytest.mark.parametrize(
-    ["model_setup", "mm_enabled", "chunked_prefill_enabled"],
+    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
     [
         (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
         pytest.param(
@@ -358,7 +358,7 @@ def test_eagle_correctness(
     sampling_config: SamplingParams,
     model_setup: tuple[str, str, str, int],
     mm_enabled: bool,
-    chunked_prefill_enabled: bool,
+    enable_chunked_prefill: bool,
     attn_backend: str,
 ):
     if attn_backend == "TREE_ATTN":
@@ -396,9 +396,7 @@ def test_eagle_correctness(
 
     method, model_name, spec_model_name, tp_size = model_setup
     max_model_len = 2048
-    max_num_batched_tokens = max_model_len
-    if chunked_prefill_enabled:
-        max_num_batched_tokens = 128
+    max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
 
     ref_llm = LLM(
         model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
@@ -420,7 +418,7 @@ def test_eagle_correctness(
         },
         max_model_len=max_model_len,
         max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=chunked_prefill_enabled,
+        enable_chunked_prefill=enable_chunked_prefill,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 4e852dca95eb..3ba8ab26f552 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache(
     )
 
     # Check 5: Verify chunked prefill is disabled
-    assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+    assert not vllm_config.scheduler_config.enable_chunked_prefill, (
         "Encoder instance should disable chunked prefill (no KV cache)"
     )
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 5117344a6844..5e6a6c29cc3c 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -232,19 +232,11 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 self.long_prefill_token_threshold,
             )
 
-    @property
-    def chunked_prefill_enabled(self) -> bool:
-        return self.enable_chunked_prefill
-
-    @chunked_prefill_enabled.setter
-    def chunked_prefill_enabled(self, value: bool):
-        self.enable_chunked_prefill = value
-
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if (
             self.max_num_batched_tokens < self.max_model_len
-            and not self.chunked_prefill_enabled
+            and not self.enable_chunked_prefill
         ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
@@ -271,7 +263,7 @@ def _verify_args(self) -> Self:
             )
 
         if self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
+            if not self.enable_chunked_prefill:
                 raise ValueError(
                     "Chunked prefill must be enabled to set "
                     "max_num_partial_prefills > 1."
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index f581267f73f7..1e6e455210c8 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -411,7 +411,7 @@ def __post_init__(self):
 
         if (
             self.model_config is not None
-            and self.scheduler_config.chunked_prefill_enabled
+            and self.scheduler_config.enable_chunked_prefill
             and self.model_config.dtype == torch.float32
             and current_platform.get_device_capability() == (7, 5)
         ):
@@ -584,7 +584,7 @@ def __post_init__(self):
         ):
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
-            self.scheduler_config.chunked_prefill_enabled = False
+            self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.long_prefill_token_threshold = 0
 
         if self.cache_config is not None:
@@ -1026,7 +1026,7 @@ def __str__(self):
             f"seed={self.model_config.seed}, "
             f"served_model_name={self.model_config.served_model_name}, "
             f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
-            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
+            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, "  # noqa
             f"pooler_config={self.model_config.pooler_config!r}, "
             f"compilation_config={self.compilation_config!r}"
         )
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index fdfa1c19789c..1da34629472c 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -192,7 +192,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         scheduler_config = vllm_config.scheduler_config
 
         if (
-            scheduler_config.chunked_prefill_enabled
+            scheduler_config.enable_chunked_prefill
             or cache_config.enable_prefix_caching
         ) and cache_config.cache_dtype != "auto":
             raise RuntimeError(
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 4fcc7955df19..ba7ad0c09173 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -497,7 +497,7 @@ def schedule(self) -> SchedulerOutput:
                 # chunked prefill has to be enabled explicitly to allow
                 # pooling requests to be chunked
                 if (
-                    not self.scheduler_config.chunked_prefill_enabled
+                    not self.scheduler_config.enable_chunked_prefill
                     and num_new_tokens > token_budget
                 ):
                     self.waiting.pop_request()
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ffb5232e770d..a6965182fc2c 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -124,7 +124,7 @@ def __init__(
             # Encoder models without KV cache don't support
             # chunked prefill. But do pooling models?
             logger.info("Disabling chunked prefill for model without KVCache")
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.enable_chunked_prefill = False
 
             scheduler_block_size = (
                 vllm_config.cache_config.block_size
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 341bf58f2da8..9b3e5b668aab 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2031,7 +2031,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]:
 
         supported_tasks = list(model.pooler.get_supported_tasks())
 
-        if self.scheduler_config.chunked_prefill_enabled:
+        if self.scheduler_config.enable_chunked_prefill:
             if "token_embed" in supported_tasks:
                 supported_tasks.remove("token_embed")
             if "token_classify" in supported_tasks:
@@ -3825,7 +3825,7 @@ def _dummy_pooler_run(
         supported_pooling_tasks = self.get_supported_pooling_tasks()
 
         if not supported_pooling_tasks:
-            if self.scheduler_config.chunked_prefill_enabled:
+            if self.scheduler_config.enable_chunked_prefill:
                 raise RuntimeError(
                     f"Model {self.model_config.model} does not support "
                     "any pooling tasks with chunked prefill enabled. "

From 0641fb3ac7b2cfdab51bf36cb2dd2594da84a3ba Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 14 Nov 2025 16:22:47 +0000
Subject: [PATCH 2/3] Add deprecation notice

Signed-off-by: DarkLight1337
---
 vllm/config/scheduler.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 5e6a6c29cc3c..13e347769365 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -8,7 +8,7 @@
 
 from pydantic import Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
-from typing_extensions import Self
+from typing_extensions import Self, deprecated
 
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -199,6 +199,19 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
             return value
         return handler(value)
 
+    @property
+    @deprecated(
+        "`SchedulerConfig.chunked_prefill_enabled` has been renamed to "
+        "`SchedulerConfig.enable_chunked_prefill`. "
+        "The old name will be removed in v0.12."
+    )
+    def chunked_prefill_enabled(self) -> bool:
+        return self.enable_chunked_prefill
+
+    @chunked_prefill_enabled.setter
+    def chunked_prefill_enabled(self, value: bool):
+        self.enable_chunked_prefill = value
+
     def __post_init__(self, is_encoder_decoder: bool) -> None:
         if is_encoder_decoder:
             # Chunked prefill should be disabled for encoder-decoder models.

From 74afec7a34c2916702f9a16cd3c6de5660bf174c Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 14 Nov 2025 16:23:45 +0000
Subject: [PATCH 3/3] Restore

Signed-off-by: DarkLight1337
---
 vllm/config/scheduler.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 13e347769365..444568994a95 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -199,19 +199,6 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
             return value
         return handler(value)
 
-    @property
-    @deprecated(
-        "`SchedulerConfig.chunked_prefill_enabled` has been renamed to "
-        "`SchedulerConfig.enable_chunked_prefill`. "
-        "The old name will be removed in v0.12."
-    )
-    def chunked_prefill_enabled(self) -> bool:
-        return self.enable_chunked_prefill
-
-    @chunked_prefill_enabled.setter
-    def chunked_prefill_enabled(self, value: bool):
-        self.enable_chunked_prefill = value
-
     def __post_init__(self, is_encoder_decoder: bool) -> None:
         if is_encoder_decoder:
             # Chunked prefill should be disabled for encoder-decoder models.
@@ -245,6 +232,19 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 self.long_prefill_token_threshold,
             )
 
+    @property
+    @deprecated(
+        "`SchedulerConfig.chunked_prefill_enabled` has been renamed to "
+        "`SchedulerConfig.enable_chunked_prefill`. "
+        "The old name will be removed in v0.12."
+    )
+    def chunked_prefill_enabled(self) -> bool:
+        return self.enable_chunked_prefill
+
+    @chunked_prefill_enabled.setter
+    def chunked_prefill_enabled(self, value: bool):
+        self.enable_chunked_prefill = value
+
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if (