diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py
index 3ddf2308eb1d..9d527c45c1fa 100644
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@@ -17,7 +17,7 @@ def chat_server_with_force_include_usage(request): # noqa: F811
         "128",
         "--enforce-eager",
         "--max-num-seqs",
-        "1",
+        "4",
         "--enable-force-include-usage",
         "--port",
         "55857",
@@ -78,7 +78,7 @@ def transcription_server_with_force_include_usage():
         "--dtype",
         "bfloat16",
         "--max-num-seqs",
-        "1",
+        "4",
         "--enforce-eager",
         "--enable-force-include-usage",
         "--gpu-memory-utilization",
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d011dfdbfbb2..ab6e5e594c23 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1046,10 +1046,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             description=SchedulerConfig.__doc__,
         )
         scheduler_group.add_argument(
-            "--max-num-batched-tokens", **scheduler_kwargs["max_num_batched_tokens"]
+            "--max-num-batched-tokens",
+            **{
+                **scheduler_kwargs["max_num_batched_tokens"],
+                "default": None,
+            },
         )
         scheduler_group.add_argument(
-            "--max-num-seqs", **scheduler_kwargs["max_num_seqs"]
+            "--max-num-seqs",
+            **{
+                **scheduler_kwargs["max_num_seqs"],
+                "default": None,
+            },
         )
         scheduler_group.add_argument(
             "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"]
@@ -1071,7 +1079,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--scheduling-policy", **scheduler_kwargs["policy"]
         )
         scheduler_group.add_argument(
-            "--enable-chunked-prefill", **scheduler_kwargs["enable_chunked_prefill"]
+            "--enable-chunked-prefill",
+            **{
+                **scheduler_kwargs["enable_chunked_prefill"],
+                "default": None,
+            },
         )
         scheduler_group.add_argument(
             "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"]
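
The arg_utils.py hunks all apply the same pattern: spread the existing per-argument kwargs dict into a new dict literal and override `"default"` to `None`, which lets the parser distinguish "flag never passed" from "flag set to the config's default". Below is a minimal standalone sketch of that pattern, assuming a hypothetical `scheduler_kwargs` entry (the real dict contents in vLLM differ); it is illustrative, not vLLM code.

```python
# Sketch of the dict-merge default override used in the diff above.
import argparse

# Hypothetical stand-in for one entry of vLLM's scheduler_kwargs.
scheduler_kwargs = {
    "max_num_seqs": {"type": int, "default": 256, "help": "Max sequences per batch."},
}

parser = argparse.ArgumentParser()
parser.add_argument(
    "--max-num-seqs",
    **{
        **scheduler_kwargs["max_num_seqs"],
        "default": None,  # override the config default; unset flags stay None
    },
)

args = parser.parse_args([])        # flag not passed on the command line
assert args.max_num_seqs is None    # caller can now fall back to config-side defaults

args = parser.parse_args(["--max-num-seqs", "4"])
assert args.max_num_seqs == 4       # explicit user value is preserved
```

The merge order matters: because `"default": None` comes after the spread of `scheduler_kwargs[...]`, it wins over any `default` key already in the dict, while `type`, `help`, and the rest pass through unchanged.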