Change profile Run batch based on max_seq_len (vllm-project#415)

hlahkar · web-flow · commit 892c09026833 · 2024-10-23T11:33:53.000+02:00
Changes the profile_run batch size so that it is derived from the max sequence length. This avoids padding during prepare_prompt, and thus avoids breaking the constraint batch_size * seq_len <= max_num_batch_tokens. In the current logic for profile_run, max_batch_size takes precedence. For example, with max_batch_size = 256, max_num_batch_tokens = 2048, block_size = 128, and max_seq_len = 1024: with the current logic, max_seq_len is updated to 8; however, in **prepare_prompt** seq_len is padded to 128, giving batch_size * seq_len = 256 * 128 > max_num_batch_tokens, which violates the above-mentioned constraint. With the updated logic, we calculate max_batch_size as 2; this avoids the padding in **prepare_prompt** and thus keeps the constraint in place. Fixes: HabanaAI#405
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
@@ -1306,10 +1306,8 @@ def create_dummy_seq_group_metadata(self,
     def profile_run(self) -> None:
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
-        max_batch_size = self.bucketing_global_state.prompt_bs_bucket_cfg[-1]
-        max_seq_len = min(
-            self.bucketing_global_state.prompt_seq_bucket_cfg[-1],
-            self.max_num_batched_tokens // max_batch_size)
+        max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1]
+        max_batch_size = self.max_num_batched_tokens // max_seq_len
 
         self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches,
                              False, True)