Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,27 @@ def create_engine_config(
disable_log_stats=self.disable_log_stats,
)

# Ensure the draft model's max_model_len is not less than the
# deployment's max_model_len.
# In V1 there is no way to disable requests whose sequence length
# exceeds the draft model's max_model_len, and such requests can lead
# to crashes, so the mismatch is rejected up front.
effective_max_model_len = self.max_model_len
if effective_max_model_len is None:
effective_max_model_len = model_config.max_model_len
if use_v1 and speculative_config is not None and \
effective_max_model_len is not None and \
speculative_config.draft_model_config is not None and \
speculative_config.draft_model_config.max_model_len is not None:
draft_max_model_len = \
speculative_config.draft_model_config.max_model_len
if draft_max_model_len < effective_max_model_len:
raise ValueError(
"The draft model config's max_model_len "
f"({draft_max_model_len}) "
"is less than the deployment's max_model_len "
f"({effective_max_model_len})."
"--max-model-len should be decreased to match.")

Comment on lines +1314 to +1334
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit - I think it makes more sense to have this code inside create_speculative_config - WDYT?

# make sure num_lookahead_slots is set appropriately depending on
# whether speculative decoding is enabled
num_lookahead_slots = self.num_lookahead_slots
Expand Down