Skip to content

Commit eb0b086

Browse files
AllenDou and zixiao
authored and committed
enable --gpu-memory-utilization in benchmark_throughput.py (vllm-project#3175)
Co-authored-by: zixiao <[email protected]>
1 parent 9b0a484 commit eb0b086

1 file changed

Lines changed: 14 additions & 7 deletions

File tree

benchmarks/benchmark_throughput.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ def run_vllm(
7474
kv_cache_dtype: str,
7575
device: str,
7676
enable_prefix_caching: bool,
77+
gpu_memory_utilization: float = 0.9,
7778
) -> float:
7879
from vllm import LLM, SamplingParams
7980
llm = LLM(model=model,
@@ -84,6 +85,7 @@ def run_vllm(
8485
trust_remote_code=trust_remote_code,
8586
dtype=dtype,
8687
max_model_len=max_model_len,
88+
gpu_memory_utilization=gpu_memory_utilization,
8789
enforce_eager=enforce_eager,
8890
kv_cache_dtype=kv_cache_dtype,
8991
device=device,
@@ -214,13 +216,12 @@ def main(args: argparse.Namespace):
214216
args.output_len)
215217

216218
if args.backend == "vllm":
217-
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
218-
args.quantization, args.tensor_parallel_size,
219-
args.seed, args.n, args.use_beam_search,
220-
args.trust_remote_code, args.dtype,
221-
args.max_model_len, args.enforce_eager,
222-
args.kv_cache_dtype, args.device,
223-
args.enable_prefix_caching)
219+
elapsed_time = run_vllm(
220+
requests, args.model, args.tokenizer, args.quantization,
221+
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
222+
args.trust_remote_code, args.dtype, args.max_model_len,
223+
args.enforce_eager, args.kv_cache_dtype, args.device,
224+
args.enable_prefix_caching, args.gpu_memory_utilization)
224225
elif args.backend == "hf":
225226
assert args.tensor_parallel_size == 1
226227
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -295,6 +296,12 @@ def main(args: argparse.Namespace):
295296
'The "auto" option will use FP16 precision '
296297
'for FP32 and FP16 models, and BF16 precision '
297298
'for BF16 models.')
299+
parser.add_argument('--gpu-memory-utilization',
300+
type=float,
301+
default=0.9,
302+
help='the fraction of GPU memory to be used for '
303+
'the model executor, which can range from 0 to 1.'
304+
'If unspecified, will use the default value of 0.9.')
298305
parser.add_argument("--enforce-eager",
299306
action="store_true",
300307
help="enforce eager execution")

0 commit comments

Comments
 (0)