@@ -19,7 +19,6 @@
 import argparse
 import asyncio
 import contextlib
-import gc
 import importlib.util
 import json
 import os
@@ -49,6 +48,7 @@
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.gc_utils import freeze_gc_heap
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
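The new `freeze_gc_heap` helper replaces the inline `gc.collect()` / `gc.freeze()` pair removed further down in this diff. A minimal sketch of the behavior it presumably wraps, assuming it is a thin convenience over the two stdlib calls (the real implementation lives in `vllm.utils.gc_utils`):

```python
import gc

def freeze_gc_heap() -> None:
    # Collect first so only live, long-lived objects survive ...
    gc.collect()
    # ... then move every survivor into the permanent generation, so
    # subsequent GC passes skip this "static" data and pause times shrink.
    gc.freeze()
```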
@@ -94,10 +94,6 @@ class BenchmarkMetrics:
     # Max output tokens per second and concurrent requests at that peak
     max_output_tokens_per_s: float
     max_concurrent_requests: int
-    # TPS per user - average tokens per second per request
-    tps_per_user: float
-    # TPS per GPU - output throughput divided by number of GPUs
-    tps_per_gpu: float
 
 
 @dataclass
@@ -292,7 +288,6 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-    num_gpu: int = 1,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     """Calculate the metrics for the benchmark.
 
@@ -316,7 +311,6 @@
     all_tpots: list[float] = []
     ttfts: list[float] = []
     e2els: list[float] = []
-    tps_per_user: list[float] = []
     for i in range(len(outputs)):
        if outputs[i].success:
            output_len = outputs[i].output_tokens
@@ -344,10 +338,6 @@
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            e2els.append(outputs[i].latency)
-
-            # Calculate TPS per request: (input_tokens + output_tokens) / latency
-            tps_per_user.append(output_len / (outputs[i].latency))
-
            completed += 1
        else:
            actual_output_lens.append(0)
@@ -486,8 +476,6 @@
        ],
        max_output_tokens_per_s=max_output_tokens_per_s,
        max_concurrent_requests=max_concurrent_requests,
-        tps_per_user=np.mean(tps_per_user) if tps_per_user else 0.0,
-        tps_per_gpu=sum(actual_output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0,
    )
 
    return metrics, actual_output_lens
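The removed aggregates are straightforward to reproduce offline from per-request data. A standalone sketch of the same arithmetic the deleted lines performed (the function name and signature are illustrative, not part of vllm):

```python
import numpy as np

def tps_aggregates(
    output_lens: list[int], latencies: list[float], dur_s: float, num_gpu: int = 1
) -> tuple[float, float]:
    """Recompute the removed TPS metrics from per-request results."""
    # TPS per user: mean of output tokens / end-to-end latency per request.
    per_user = [n / lat for n, lat in zip(output_lens, latencies) if lat > 0]
    tps_per_user = float(np.mean(per_user)) if per_user else 0.0
    # TPS per GPU: total output tokens / benchmark duration / GPU count.
    tps_per_gpu = sum(output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0
    return tps_per_user, tps_per_gpu
```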
@@ -520,7 +508,6 @@ async def benchmark(
    ramp_up_start_rps: int | None = None,
    ramp_up_end_rps: int | None = None,
    ready_check_timeout_sec: int = 600,
-    num_gpu: int = 1,
 ):
    try:
        request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -752,7 +739,6 @@ async def limited_request_func(request_func_input, session, pbar):
            tokenizer=tokenizer,
            selected_percentiles=selected_percentiles,
            goodput_config_dict=goodput_config_dict,
-            num_gpu=num_gpu,
        )
    else:
        metrics = calculate_metrics_for_embeddings(
@@ -805,11 +791,6 @@
            "Total Token throughput (tok/s):", metrics.total_token_throughput
        )
    )
-    if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10.2f}".format("TPS per user (tok/s):",
-                                        metrics.tps_per_user))
-        print("{:<40} {:<10.2f}".format("TPS per GPU (tok/s):",
-                                        metrics.tps_per_gpu))
 
    if isinstance(metrics, BenchmarkMetrics):
        result = {
@@ -830,8 +811,6 @@
            "errors": [output.error for output in outputs],
            "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
            "max_concurrent_requests": metrics.max_concurrent_requests,
-            "tps_per_user": metrics.tps_per_user,
-            "tps_per_gpu": metrics.tps_per_gpu,
        }
    else:
        result = {
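With the keys gone from the result dict, a per-GPU figure can still be derived from a saved result file in a post-processing step. A hedged sketch, assuming the JSON retains an aggregate `output_throughput` field (tok/s across all GPUs); the file name and key are assumptions, so check the actual output:

```python
import json

# Hypothetical post-processing: recover "TPS per GPU" after the fact.
with open("benchmark_result.json") as f:  # path is illustrative
    result = json.load(f)

num_gpu = 8  # however many GPUs actually served the run
print("TPS per GPU (tok/s):", result["output_throughput"] / num_gpu)
```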
@@ -1157,7 +1136,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
11571136 parser .add_argument (
11581137 "--percentile-metrics" ,
11591138 type = str ,
1160- default = "ttft,tpot,itl,e2el" ,
1139+ default = None ,
11611140 help = "Comma-separated list of selected metrics to report percentils. "
11621141 "This argument specifies the metrics to report percentiles. "
11631142 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
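With `default=None`, the effective metric list can be resolved per task before metrics are computed (see `args.percentile_metrics or default_percentile_metrics` later in this diff). One plausible resolution, sketched here as an assumption since the diff does not show how `default_percentile_metrics` is chosen:

```python
def resolve_percentile_metrics(arg_value: str | None, task_type: str) -> str:
    # Fall back to a task-appropriate default when --percentile-metrics is
    # omitted; embedding runs have no TTFT/TPOT/ITL, only end-to-end latency.
    if arg_value is not None:
        return arg_value
    return "ttft,tpot,itl,e2el" if task_type == "generation" else "e2el"
```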
@@ -1304,12 +1283,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
13041283 "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
13051284 "the ready check will be skipped." ,
13061285 )
1307- parser .add_argument (
1308- "--num-gpu" ,
1309- type = int ,
1310- default = 1 ,
1311- help = "Number of GPUs used for serving. Used to calculate TPS per GPU metric."
1312- )
13131286
13141287 parser .add_argument (
13151288 "--extra-body" ,
@@ -1441,8 +1414,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    percentile_metrics: str = args.percentile_metrics or default_percentile_metrics
 
    # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
+    freeze_gc_heap()
 
    benchmark_result = await benchmark(
        task_type=task_type,
@@ -1471,7 +1443,6 @@
        ramp_up_start_rps=args.ramp_up_start_rps,
        ramp_up_end_rps=args.ramp_up_end_rps,
        ready_check_timeout_sec=args.ready_check_timeout_sec,
-        num_gpu=args.num_gpu,
    )
 
    # Save config and results to json
@@ -1555,4 +1526,4 @@
        json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)
 
-    return result_json
+    return result_json