7 changes: 7 additions & 0 deletions vllm/config.py
@@ -3124,6 +3124,13 @@
     # derived length from the HF model config.
     if max_model_len is None:
         max_model_len = int(derived_max_model_len)
+        if current_platform.is_tpu():
+            logger.warning(
+                "--max-model-len is not specified, so the model's default "
+                f"length {max_model_len} is used, which might be too "
+                "large. Please set --max-model-len based on your request "
+                "input and output lengths to avoid unnecessary "
+                "performance degradation.")

Check failure (GitHub Actions / pre-commit) on line 3133 in vllm/config.py:
Ruff (G004) vllm/config.py:3128:28: Logging statement uses f-string
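Ruff's G004 rule flags f-strings inside logging calls because the string is formatted even when the log level is disabled, defeating lazy interpolation. A minimal sketch of how the warning could be rewritten with %-style arguments to satisfy the check; this is illustrative only and not part of the PR (the logger name and value are placeholders):

# Sketch only (not part of the PR): the same warning with lazy %-style
# arguments, which is the pattern Ruff's G004 rule expects.
import logging

logger = logging.getLogger("vllm.config")
max_model_len = 32768  # placeholder value for illustration

logger.warning(
    "--max-model-len is not specified, so the model's default length "
    "%d is used, which might be too large. Please set --max-model-len "
    "based on your request input and output lengths to avoid "
    "unnecessary performance degradation.", max_model_len)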
     elif max_model_len > derived_max_model_len:
         # Some models might have a separate key for specifying model_max_length
         # that will be bigger than derived_max_model_len. We compare user input
31 changes: 28 additions & 3 deletions vllm/engine/arg_utils.py
@@ -1432,8 +1432,8 @@
         # as the platform that vLLM is running on (e.g. the case of scaling
         # vLLM with Ray) and has no GPUs. In this case we use the default
         # values for non-H100/H200 GPUs.
+        from vllm.platforms import current_platform
         try:
-            from vllm.platforms import current_platform
             device_memory = current_platform.get_device_total_memory()
         except Exception:
             # This is only used to set default_max_num_batched_tokens
@@ -1454,11 +1454,36 @@
             }
             default_max_num_seqs = 256

+        # tpu specific default values.
+        if current_platform.is_tpu():
+            default_max_num_batched_tokens_tpu = {
+                UsageContext.LLM_CLASS: {
+                    'v6e': 2048,
+                    'v5e': 1024,
+                    'v5p': 512,
+                },
+                UsageContext.OPENAI_API_SERVER: {
+                    'v6e': 1024,
+                    'v5e': 512,
+                    'v5p': 256,
Review comment (Member): Why is v5p smaller than v5e? It has more FLOPs and bandwidth.

Reply (Contributor, PR author): Yes, see the table above; the peak arithmetic intensity (AI) of v5p is lower than that of v5e, so it saturates compute with fewer batched tokens. A rough sketch of that comparison follows this block.

+                }
+            }
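For context on that exchange: peak arithmetic intensity here means peak compute divided by memory bandwidth. A chip with a lower ratio becomes compute-bound at a smaller batch, so a smaller default token budget already saturates it. A minimal sketch of that comparison; the spec numbers are placeholders for illustration only, not official TPU figures:

# Illustrative sketch only: the TFLOP/s and GB/s values below are
# placeholders, not official TPU specifications.
def peak_arithmetic_intensity(peak_tflops: float, hbm_gbps: float) -> float:
    """Peak FLOPs available per byte read from HBM."""
    return (peak_tflops * 1e12) / (hbm_gbps * 1e9)

# chip: (assumed peak bf16 TFLOP/s, assumed HBM GB/s) -- hypothetical values
chips = {"v6e": (900.0, 1600.0), "v5e": (200.0, 800.0), "v5p": (450.0, 2800.0)}
for name, (tflops, gbps) in chips.items():
    print(name, round(peak_arithmetic_intensity(tflops, gbps)))
# With these placeholders v5p has the lowest FLOPs-per-byte ratio, which is
# why it gets the smallest default max_num_batched_tokens above.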

         use_context_value = usage_context.value if usage_context else None
         if (self.max_num_batched_tokens is None
                 and usage_context in default_max_num_batched_tokens):
-            self.max_num_batched_tokens = default_max_num_batched_tokens[
-                usage_context]
+            if current_platform.is_tpu():
+                from tpu_info import device
+                try:
+                    chip_type, _ = device.get_local_chips()
+                    self.max_num_batched_tokens = default_max_num_batched_tokens_tpu[

Check failure (GitHub Actions / pre-commit) on line 1479 in vllm/engine/arg_utils.py:
Ruff (E501) vllm/engine/arg_utils.py:1479:81: Line too long (85 > 80)
+                        usage_context][chip_type.name.lower()]
+                except Exception:
+                    self.max_num_batched_tokens = default_max_num_batched_tokens[

Check failure (GitHub Actions / pre-commit) on line 1482 in vllm/engine/arg_utils.py:
Ruff (E501) vllm/engine/arg_utils.py:1482:81: Line too long (81 > 80)
(a reflowed sketch that stays under 80 characters follows this if/else block)
+                        usage_context]
+            else:
+                self.max_num_batched_tokens = default_max_num_batched_tokens[
+                    usage_context]
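The two E501 failures above come from the nested lookup lines running past 80 characters at this indentation depth. One possible reflow, shown only as a fragment that reuses the names from the diff (pulling the per-context TPU table into a local variable first):

# Fragment for illustration only, reusing names from the diff above;
# not runnable on its own. Splitting the nested lookup keeps each line
# under the 80-character limit.
tpu_defaults = default_max_num_batched_tokens_tpu[usage_context]
self.max_num_batched_tokens = tpu_defaults[chip_type.name.lower()]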
             logger.debug(
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, use_context_value)
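With both changes in place, TPU users who omit these flags get a chip-specific default for max_num_batched_tokens plus a warning about max_model_len, while explicit values still take precedence. A hypothetical way to set both explicitly from Python (model name and numbers are placeholders, not recommendations):

# Hypothetical example: passing both values explicitly so neither the TPU
# warning nor the per-chip default lookup applies. Placeholders throughout.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder model
    max_model_len=2048,                  # bound context length explicitly
    max_num_batched_tokens=1024,         # override the per-chip default
)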