@@ -74,6 +74,7 @@ def run_vllm(
7474 kv_cache_dtype : str ,
7575 device : str ,
7676 enable_prefix_caching : bool ,
77+ gpu_memory_utilization : float = 0.9 ,
7778) -> float :
7879 from vllm import LLM , SamplingParams
7980 llm = LLM (model = model ,
@@ -84,6 +85,7 @@ def run_vllm(
8485 trust_remote_code = trust_remote_code ,
8586 dtype = dtype ,
8687 max_model_len = max_model_len ,
88+ gpu_memory_utilization = gpu_memory_utilization ,
8789 enforce_eager = enforce_eager ,
8890 kv_cache_dtype = kv_cache_dtype ,
8991 device = device ,
@@ -214,13 +216,12 @@ def main(args: argparse.Namespace):
214216 args .output_len )
215217
216218 if args .backend == "vllm" :
217- elapsed_time = run_vllm (requests , args .model , args .tokenizer ,
218- args .quantization , args .tensor_parallel_size ,
219- args .seed , args .n , args .use_beam_search ,
220- args .trust_remote_code , args .dtype ,
221- args .max_model_len , args .enforce_eager ,
222- args .kv_cache_dtype , args .device ,
223- args .enable_prefix_caching )
219+ elapsed_time = run_vllm (
220+ requests , args .model , args .tokenizer , args .quantization ,
221+ args .tensor_parallel_size , args .seed , args .n , args .use_beam_search ,
222+ args .trust_remote_code , args .dtype , args .max_model_len ,
223+ args .enforce_eager , args .kv_cache_dtype , args .device ,
224+ args .enable_prefix_caching , args .gpu_memory_utilization )
224225 elif args .backend == "hf" :
225226 assert args .tensor_parallel_size == 1
226227 elapsed_time = run_hf (requests , args .model , tokenizer , args .n ,
@@ -295,6 +296,12 @@ def main(args: argparse.Namespace):
295296 'The "auto" option will use FP16 precision '
296297 'for FP32 and FP16 models, and BF16 precision '
297298 'for BF16 models.' )
299+ parser .add_argument ('--gpu-memory-utilization' ,
300+ type = float ,
301+ default = 0.9 ,
302+ help = 'the fraction of GPU memory to be used for '
303+ 'the model executor, which can range from 0 to 1. '
304+ 'If unspecified, will use the default value of 0.9.' )
298305 parser .add_argument ("--enforce-eager" ,
299306 action = "store_true" ,
300307 help = "enforce eager execution" )
0 commit comments