11# SPDX-License-Identifier: Apache-2.0
22"""Benchmark the latency of processing a single batch of requests."""
3+
34import argparse
45import dataclasses
56import json
7+ import os
68import time
79from pathlib import Path
8- from typing import List , Optional
10+ from typing import Any , Dict , List , Optional
911
1012import numpy as np
1113import torch
14+ from benchmark_utils import convert_to_pytorch_benchmark_format
1215from tqdm import tqdm
1316
1417from vllm import LLM , SamplingParams
1821from vllm .utils import FlexibleArgumentParser
1922
2023
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                     results: Dict[str, Any]) -> None:
    """Persist latency results in the PyTorch OSS benchmark JSON format.

    Converts the collected results into PyTorch-benchmark records and, when
    any records are produced, writes them alongside ``args.output_json`` as
    ``<stem>.pytorch.json``. Assumes ``args.output_json`` is set (the caller
    only invokes this under ``if args.output_json:``).
    """
    # Carry the summary statistics through as extra metadata.
    extra = {key: results[key] for key in ("avg_latency", "percentiles")}
    records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={"latency": results["latencies"]},
        extra_info=extra,
    )
    if not records:
        return
    stem, _ext = os.path.splitext(args.output_json)
    with open(f"{stem}.pytorch.json", "w") as out_file:
        json.dump(records, out_file)
36+
2137def main (args : argparse .Namespace ):
2238 print (args )
2339
@@ -54,7 +70,8 @@ def llm_generate():
5470 beam_width = args .n ,
5571 max_tokens = args .output_len ,
5672 ignore_eos = True ,
57- ))
73+ ),
74+ )
5875
5976 def run_to_completion (profile_dir : Optional [str ] = None ):
6077 if profile_dir :
@@ -64,7 +81,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
6481 torch .profiler .ProfilerActivity .CUDA ,
6582 ],
6683 on_trace_ready = torch .profiler .tensorboard_trace_handler (
67- str (profile_dir ))) as p :
84+ str (profile_dir )),
85+ ) as p :
6886 llm_generate ()
6987 print (p .key_averages ().table (sort_by = "self_cuda_time_total" ))
7088 else :
@@ -81,9 +99,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
8199 if args .profile :
82100 profile_dir = args .profile_result_dir
83101 if not profile_dir :
84- profile_dir = Path (
85- "."
86- ) / "vllm_benchmark_result" / f"latency_result_{ time .time ()} "
102+ profile_dir = (Path ("." ) / "vllm_benchmark_result" /
103+ f"latency_result_{ time .time ()} " )
87104 print (f"Profiling (results will be saved to '{ profile_dir } ')..." )
88105 run_to_completion (profile_dir = profile_dir )
89106 return
@@ -95,9 +112,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
95112 latencies = np .array (latencies )
96113 percentages = [10 , 25 , 50 , 75 , 90 , 99 ]
97114 percentiles = np .percentile (latencies , percentages )
98- print (f' Avg latency: { np .mean (latencies )} seconds' )
115+ print (f" Avg latency: { np .mean (latencies )} seconds" )
99116 for percentage , percentile in zip (percentages , percentiles ):
100- print (f' { percentage } % percentile latency: { percentile } seconds' )
117+ print (f" { percentage } % percentile latency: { percentile } seconds" )
101118
102119 # Output JSON results if specified
103120 if args .output_json :
@@ -108,43 +125,51 @@ def run_to_completion(profile_dir: Optional[str] = None):
108125 }
109126 with open (args .output_json , "w" ) as f :
110127 json .dump (results , f , indent = 4 )
128+ save_to_pytorch_benchmark_format (args , results )
111129
112130
113- if __name__ == ' __main__' :
131+ if __name__ == " __main__" :
114132 parser = FlexibleArgumentParser (
115- description = 'Benchmark the latency of processing a single batch of '
116- 'requests till completion.' )
117- parser .add_argument ('--input-len' , type = int , default = 32 )
118- parser .add_argument ('--output-len' , type = int , default = 128 )
119- parser .add_argument ('--batch-size' , type = int , default = 8 )
120- parser .add_argument ('--n' ,
121- type = int ,
122- default = 1 ,
123- help = 'Number of generated sequences per prompt.' )
124- parser .add_argument ('--use-beam-search' , action = 'store_true' )
125- parser .add_argument ('--num-iters-warmup' ,
126- type = int ,
127- default = 10 ,
128- help = 'Number of iterations to run for warmup.' )
129- parser .add_argument ('--num-iters' ,
133+ description = "Benchmark the latency of processing a single batch of "
134+ "requests till completion." )
135+ parser .add_argument ("--input-len" , type = int , default = 32 )
136+ parser .add_argument ("--output-len" , type = int , default = 128 )
137+ parser .add_argument ("--batch-size" , type = int , default = 8 )
138+ parser .add_argument (
139+ "--n" ,
140+ type = int ,
141+ default = 1 ,
142+ help = "Number of generated sequences per prompt." ,
143+ )
144+ parser .add_argument ("--use-beam-search" , action = "store_true" )
145+ parser .add_argument (
146+ "--num-iters-warmup" ,
147+ type = int ,
148+ default = 10 ,
149+ help = "Number of iterations to run for warmup." ,
150+ )
151+ parser .add_argument ("--num-iters" ,
130152 type = int ,
131153 default = 30 ,
132- help = ' Number of iterations to run.' )
154+ help = " Number of iterations to run." )
133155 parser .add_argument (
134- '--profile' ,
135- action = 'store_true' ,
136- help = 'profile the generation process of a single batch' )
156+ "--profile" ,
157+ action = "store_true" ,
158+ help = "profile the generation process of a single batch" ,
159+ )
137160 parser .add_argument (
138- ' --profile-result-dir' ,
161+ " --profile-result-dir" ,
139162 type = str ,
140163 default = None ,
141- help = ('path to save the pytorch profiler output. Can be visualized '
142- 'with ui.perfetto.dev or Tensorboard.' ))
164+ help = ("path to save the pytorch profiler output. Can be visualized "
165+ "with ui.perfetto.dev or Tensorboard." ),
166+ )
143167 parser .add_argument (
144- ' --output-json' ,
168+ " --output-json" ,
145169 type = str ,
146170 default = None ,
147- help = 'Path to save the latency results in JSON format.' )
171+ help = "Path to save the latency results in JSON format." ,
172+ )
148173
149174 parser = EngineArgs .add_cli_args (parser )
150175 args = parser .parse_args ()
0 commit comments