Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,15 @@ def __init__(
self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
cache_config.cache_dtype]

self.is_multimodal_model = model_config.is_multimodal_model
# NOTE(woosuk): sliding_window is None for models with interleaved
# attention. Use interleaved_sliding_window instead.
self.sliding_window = model_config.get_sliding_window()
self.interleaved_sliding_window = getattr(
model_config.hf_text_config, "interleaved_sliding_window", None)
self.window_size = (self.sliding_window
or self.interleaved_sliding_window)

self.is_multimodal_model = model_config.is_multimodal_model
self.block_size = cache_config.block_size
self.max_model_len = model_config.max_model_len
self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
Expand Down Expand Up @@ -674,7 +681,7 @@ def _compute_cascade_attn_prefix_len(
num_query_heads=self.num_query_heads,
num_kv_heads=self.num_kv_heads,
use_alibi=False, # FIXME
use_sliding_window=self.sliding_window is not None,
use_sliding_window=self.window_size is not None,
num_sms=self.num_sms,
)
return common_prefix_len if use_cascade else 0
Expand Down