Skip to content

Commit 9917cda

Browse files
AdrianAbeyta
and authored
Update Benchmark Profiling Scripts (#417)
* Update profiling benchmarks * Fix linter errors --------- Co-authored-by: AdrianAbeyta <[email protected]>
1 parent e2dc610 commit 9917cda

File tree

2 files changed

+263
-64
lines changed

2 files changed

+263
-64
lines changed

benchmarks/profiling/benchmark_latency.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from vllm import LLM, SamplingParams
1717
from vllm.engine.arg_utils import EngineArgs
1818
from vllm.inputs import PromptType
19+
from vllm.sampling_params import BeamSearchParams
1920
from vllm.utils import FlexibleArgumentParser
2021

2122

@@ -87,37 +88,47 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
8788
dummy_prompt_token_ids = np.random.randint(10000,
8889
size=(args.batch_size,
8990
args.input_len))
90-
dummy_inputs: List[PromptType] = [{
91+
dummy_prompts: List[PromptType] = [{
9192
"prompt_token_ids": batch
9293
} for batch in dummy_prompt_token_ids.tolist()]
9394

94-
def run_to_completion(profile_result_dir: Optional[str] = None):
95-
if profile_result_dir:
96-
with get_profiling_context(profile_result_dir):
97-
llm.generate(dummy_inputs,
98-
sampling_params=sampling_params,
99-
use_tqdm=False)
100-
else:
101-
start_time = time.perf_counter()
102-
llm.generate(dummy_inputs,
95+
def llm_generate():
96+
if not args.use_beam_search:
97+
llm.generate(dummy_prompts,
10398
sampling_params=sampling_params,
10499
use_tqdm=False)
100+
else:
101+
llm.beam_search(
102+
dummy_prompts,
103+
BeamSearchParams(
104+
beam_width=args.n,
105+
max_tokens=args.output_len,
106+
ignore_eos=True,
107+
))
108+
109+
def run_to_completion(profile_dir: Optional[str] = None):
110+
if profile_dir:
111+
with get_profiling_context(profile_dir):
112+
llm_generate()
113+
else:
114+
start_time = time.perf_counter()
115+
llm_generate()
105116
end_time = time.perf_counter()
106117
latency = end_time - start_time
107118
return latency
108119

109120
print("Warming up...")
110121
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
111-
run_to_completion(profile_result_dir=None)
122+
run_to_completion(profile_dir=None)
112123

113124
if args.profile_torch or args.profile_rpd:
114-
run_to_completion(profile_result_dir=profile_result_dir)
125+
run_to_completion(profile_dir=profile_result_dir)
115126
return
116127

117128
# Benchmark.
118129
latencies = []
119130
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
120-
latencies.append(run_to_completion(profile_result_dir=None))
131+
latencies.append(run_to_completion(profile_dir=None))
121132
latencies = np.array(latencies)
122133
percentages = [10, 25, 50, 75, 90, 99]
123134
percentiles = np.percentile(latencies, percentages)

0 commit comments

Comments (0)