Skip to content

Commit 40e7f44

Browse files
kylesayrs authored and DefTruth committed
[Bugfix] Limit profiling run sequence length by max_model_len (vllm-project#14785)
Signed-off-by: Kyle Sayers <[email protected]>
Signed-off-by: DefTruth <[email protected]>
1 parent 32ac8d7 commit 40e7f44

File tree

5 files changed

+9
-0
lines changed

5 files changed

+9
-0
lines changed

vllm/inputs/registry.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,11 @@ def dummy_data_for_profiling(
330330
from vllm.multimodal import MultiModalKwargs
331331
from vllm.multimodal.profiling import MultiModalProfiler
332332

333+
if seq_len > model_config.max_model_len:
334+
raise AssertionError(
335+
f"Profiling attempted with sequence length ({seq_len}) "
336+
f"greater than model length ({model_config.max_model_len})")
337+
333338
if mm_registry.has_processor(model_config):
334339
tokenizer = cached_tokenizer_from_config(model_config)
335340
processor = mm_registry.create_processor(model_config,

vllm/worker/enc_dec_model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ def profile_run(self) -> None:
281281
for group_id in range(max_num_seqs):
282282
seq_len = (max_num_batched_tokens // max_num_seqs +
283283
(group_id < max_num_batched_tokens % max_num_seqs))
284+
seq_len = min(seq_len, self.model_config.max_model_len)
284285
batch_size += seq_len
285286

286287
decoder_dummy_data = self.input_registry \

vllm/worker/model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,7 @@ def _dummy_run(self,
13021302
for group_id in range(max_num_seqs):
13031303
seq_len = (max_num_batched_tokens // max_num_seqs +
13041304
(group_id < max_num_batched_tokens % max_num_seqs))
1305+
seq_len = min(seq_len, self.model_config.max_model_len)
13051306
batch_size += seq_len
13061307

13071308
dummy_data = self.input_registry \

vllm/worker/openvino_model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ def _prepare_model_input(
148148
seq_len = min(
149149
seq_data.get_len(),
150150
computed_len + seq_group_metadata.token_chunk_size,
151+
self.model_config.max_model_len,
151152
)
152153
if is_prompt:
153154
tokens = seq_data.get_token_ids()[computed_len:seq_len]

vllm/worker/xpu_model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,7 @@ def profile_run(self) -> None:
466466
for group_id in range(max_num_seqs):
467467
seq_len = (max_num_batched_tokens // max_num_seqs +
468468
(group_id < max_num_batched_tokens % max_num_seqs))
469+
seq_len = min(seq_len, self.model_config.max_model_len)
469470
batch_size += seq_len
470471

471472
dummy_data = self.input_registry \

0 commit comments

Comments (0)