@@ -288,10 +288,11 @@ def initialize_cache(self, num_gpu_blocks: int,
288288
289289 This also warms up the model, which may record CUDA graphs.
290290 """
291- raise_if_cache_size_invalid (num_gpu_blocks ,
292- self .cache_config .block_size ,
293- self .cache_config .is_attention_free ,
294- self .model_config .max_model_len )
291+ raise_if_cache_size_invalid (
292+ num_gpu_blocks , self .cache_config .block_size ,
293+ self .cache_config .is_attention_free ,
294+ self .model_config .max_model_len ,
295+ self .parallel_config .pipeline_parallel_size )
295296
296297 self .cache_config .num_gpu_blocks = num_gpu_blocks
297298 self .cache_config .num_cpu_blocks = num_cpu_blocks
@@ -530,7 +531,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
530531
531532
532533def raise_if_cache_size_invalid (num_gpu_blocks , block_size , is_attention_free ,
533- max_model_len ) -> None :
534+ max_model_len , pipeline_parallel_size ) -> None :
534535 if is_attention_free and num_gpu_blocks != 0 :
535536 raise ValueError ("No memory should be allocated for the cache blocks "
536537 f"for an attention-free model, but { num_gpu_blocks } "
@@ -539,7 +540,7 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
539540 raise ValueError ("No available memory for the cache blocks. "
540541 "Try increasing `gpu_memory_utilization` when "
541542 "initializing the engine." )
542- max_seq_len = block_size * num_gpu_blocks
543+ max_seq_len = block_size * ( num_gpu_blocks // pipeline_parallel_size )
543544 if not is_attention_free and max_model_len > max_seq_len :
544545 raise ValueError (
545546 f"The model's max seq len ({ max_model_len } ) "
0 commit comments