Skip to content

Commit d028e28

Browse files
zhengy001 authored and lulmer committed
[PP] Correct cache size check (vllm-project#13873)
Signed-off-by: Yang Zheng <[email protected]>
Signed-off-by: Louis Ulmer <[email protected]>
1 parent af03641 commit d028e28

2 files changed

Lines changed: 14 additions & 12 deletions

File tree

vllm/worker/hpu_worker.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -258,9 +258,10 @@ def initialize_cache(self, num_gpu_blocks: int,
258258
259259
This also warms up the model, which may record CUDA graphs.
260260
"""
261-
raise_if_cache_size_invalid(num_gpu_blocks,
262-
self.cache_config.block_size,
263-
self.model_config.max_model_len)
261+
raise_if_cache_size_invalid(
262+
num_gpu_blocks, self.cache_config.block_size,
263+
self.model_config.max_model_len,
264+
self.parallel_config.pipeline_parallel_size)
264265

265266
self.cache_config.num_gpu_blocks = num_gpu_blocks
266267
self.cache_config.num_cpu_blocks = num_cpu_blocks
@@ -442,13 +443,13 @@ def init_worker_distributed_environment(
442443
parallel_config.pipeline_parallel_size)
443444

444445

445-
def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
446-
max_model_len) -> None:
446+
def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len,
447+
pipeline_parallel_size) -> None:
447448
if num_gpu_blocks <= 0:
448449
raise ValueError("No available memory for the cache blocks. "
449450
"Try increasing `gpu_memory_utilization` when "
450451
"initializing the engine.")
451-
max_seq_len = block_size * num_gpu_blocks
452+
max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
452453
if max_model_len > max_seq_len:
453454
raise ValueError(
454455
f"The model's max seq len ({max_model_len}) "

vllm/worker/worker.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -288,10 +288,11 @@ def initialize_cache(self, num_gpu_blocks: int,
288288
289289
This also warms up the model, which may record CUDA graphs.
290290
"""
291-
raise_if_cache_size_invalid(num_gpu_blocks,
292-
self.cache_config.block_size,
293-
self.cache_config.is_attention_free,
294-
self.model_config.max_model_len)
291+
raise_if_cache_size_invalid(
292+
num_gpu_blocks, self.cache_config.block_size,
293+
self.cache_config.is_attention_free,
294+
self.model_config.max_model_len,
295+
self.parallel_config.pipeline_parallel_size)
295296

296297
self.cache_config.num_gpu_blocks = num_gpu_blocks
297298
self.cache_config.num_cpu_blocks = num_cpu_blocks
@@ -530,7 +531,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
530531

531532

532533
def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
533-
max_model_len) -> None:
534+
max_model_len, pipeline_parallel_size) -> None:
534535
if is_attention_free and num_gpu_blocks != 0:
535536
raise ValueError("No memory should be allocated for the cache blocks "
536537
f"for an attention-free model, but {num_gpu_blocks} "
@@ -539,7 +540,7 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
539540
raise ValueError("No available memory for the cache blocks. "
540541
"Try increasing `gpu_memory_utilization` when "
541542
"initializing the engine.")
542-
max_seq_len = block_size * num_gpu_blocks
543+
max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
543544
if not is_attention_free and max_model_len > max_seq_len:
544545
raise ValueError(
545546
f"The model's max seq len ({max_model_len}) "

0 commit comments

Comments
 (0)