@@ -16,6 +16,7 @@
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -87,37 +88,47 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_inputs: List[PromptType] = [{
+    dummy_prompts: List[PromptType] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
-    def run_to_completion(profile_result_dir: Optional[str] = None):
-        if profile_result_dir:
-            with get_profiling_context(profile_result_dir):
-                llm.generate(dummy_inputs,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
-        else:
-            start_time = time.perf_counter()
-            llm.generate(dummy_inputs,
+    def llm_generate():
+        if not args.use_beam_search:
+            llm.generate(dummy_prompts,
                          sampling_params=sampling_params,
                          use_tqdm=False)
+        else:
+            llm.beam_search(
+                dummy_prompts,
+                BeamSearchParams(
+                    beam_width=args.n,
+                    max_tokens=args.output_len,
+                    ignore_eos=True,
+                ))
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        if profile_dir:
+            with get_profiling_context(profile_dir):
+                llm_generate()
+        else:
+            start_time = time.perf_counter()
+            llm_generate()
             end_time = time.perf_counter()
             latency = end_time - start_time
             return latency
 
     print("Warming up...")
     for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_result_dir=None)
+        run_to_completion(profile_dir=None)
 
     if args.profile_torch or args.profile_rpd:
-        run_to_completion(profile_result_dir=profile_result_dir)
+        run_to_completion(profile_dir=profile_result_dir)
         return
 
     # Benchmark.
     latencies = []
     for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_result_dir=None))
+        latencies.append(run_to_completion(profile_dir=None))
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
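The refactor pulls the generation call into a shared llm_generate() helper so the profiled path and the timed path run identical work, and it routes beam search through the dedicated llm.beam_search entry point with a BeamSearchParams struct instead of through SamplingParams. The sketch below is a minimal standalone illustration of the call pattern the benchmark now exercises; the model name, prompt contents, and sizes are placeholders chosen for this example, not values from this PR.

from vllm import LLM
from vllm.sampling_params import BeamSearchParams

# Placeholder model -- swap in whatever model the benchmark targets.
llm = LLM(model="facebook/opt-125m")

# Token-id prompts in the same TokensPrompt dict form the benchmark builds.
dummy_prompts = [{"prompt_token_ids": [42] * 32} for _ in range(4)]

# beam_width maps to args.n and max_tokens to args.output_len in the
# benchmark; ignore_eos=True forces full-length decoding so every
# iteration does the same amount of work.
outputs = llm.beam_search(
    dummy_prompts,
    BeamSearchParams(
        beam_width=4,
        max_tokens=16,
        ignore_eos=True,
    ))
print(f"Completed beam search for {len(outputs)} prompts")

When args.use_beam_search is unset, the helper falls back to the plain llm.generate path, so existing latency numbers are unaffected.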