diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml
index 9c48e7d061bf..ddb4260a05d0 100644
--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -8,7 +8,7 @@ image:
   # -- Image tag
   tag: "latest"
   # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000", "--max-num-batched-tokens", "2048", "--max-num-seqs", "16", "--max-model-len", "2048"]
 
 # -- Container port
 containerPort: 8000
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index af546c3032af..b615ff543b3c 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -289,6 +289,19 @@ def validate_parsed_serve_args(args: argparse.Namespace):
         raise TypeError("Error: --enable-reasoning requires "
                         "--reasoning-parser")
 
+    # Ensure that --max-num-batched-tokens, --max-num-seqs, and
+    # --max-model-len are passed on the command line when running on TPU.
+    from vllm.platforms import current_platform
+    if current_platform.is_tpu():
+        if args.max_num_batched_tokens is None:
+            raise ValueError("Requires --max-num-batched-tokens")
+
+        if args.max_num_seqs is None:
+            raise ValueError("Requires --max-num-seqs")
+
+        if args.max_model_len is None:
+            raise ValueError("Requires --max-model-len")
+
 
 def create_parser_for_docs() -> FlexibleArgumentParser:
     parser_for_docs = FlexibleArgumentParser(
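
For reference, a minimal sketch (not part of the patch) of how the new TPU-only check surfaces to a caller. It assumes the existing `make_arg_parser` and `validate_parsed_serve_args` helpers in `vllm.entrypoints.openai.cli_args` and `FlexibleArgumentParser` from `vllm.utils`; the argument values simply mirror the Helm chart command above.

```python
# Sketch, assuming make_arg_parser/validate_parsed_serve_args from
# vllm.entrypoints.openai.cli_args and FlexibleArgumentParser from vllm.utils.
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                              validate_parsed_serve_args)
from vllm.utils import FlexibleArgumentParser

parser = make_arg_parser(FlexibleArgumentParser())

# On a TPU host, omitting the three limits now fails at argument validation
# instead of later during engine initialization.
args = parser.parse_args(["--model", "/data/"])
try:
    validate_parsed_serve_args(args)
except ValueError as e:
    print(e)  # e.g. "Requires --max-num-batched-tokens"

# Passing the limits explicitly (as the Helm chart command now does) validates.
args = parser.parse_args([
    "--model", "/data/",
    "--max-num-batched-tokens", "2048",
    "--max-num-seqs", "16",
    "--max-model-len", "2048",
])
validate_parsed_serve_args(args)
```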