Commit a92ec36

fix_attention_free_models
Signed-off-by: Huamin Li <[email protected]>
1 parent: b2e65cb

File tree: 1 file changed (+13, -2)


vllm/v1/worker/gpu_model_runner.py

Lines changed: 13 additions & 2 deletions
@@ -4008,8 +4008,14 @@ def create_attn_groups(
         for attn_backends_map in attention_backend_maps:
             self.attn_groups.append(create_attn_groups(attn_backends_map))
 
-        # Calculate reorder batch threshold (if needed)
-        self.calculate_reorder_batch_threshold()
+        # Calculate reorder batch threshold (if needed).
+        # For attention-free models there will be no attention groups; in that case
+        # there is nothing to reorder.
+        if any(len(groups) for groups in self.attn_groups):
+            self.calculate_reorder_batch_threshold()
+        else:
+            # Disable reordering explicitly to make intent clear to later call sites.
+            self.reorder_batch_threshold = None
 
     def _check_and_update_cudagraph_mode(
         self, attention_backends: set[type[AttentionBackend]]
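Read as code, the first hunk's guard amounts to the minimal sketch below. The RunnerSketch class and its initialize_attn_groups method are illustrative stand-ins, not vLLM's API; the assumption is that attn_groups is a list of per-KV-cache-group lists, which is how the diff's any(len(groups) for groups in self.attn_groups) check reads:

from typing import Optional


class RunnerSketch:
    """Illustrative stand-in for the model runner; not vLLM's API."""

    def __init__(self, attn_groups: list[list[object]]) -> None:
        # One inner list of attention groups per KV cache group; an
        # attention-free model produces no non-empty inner lists.
        self.attn_groups = attn_groups
        self.reorder_batch_threshold: Optional[int] = None

    def calculate_reorder_batch_threshold(self) -> None:
        # Stand-in for the real reduction over backend-reported thresholds.
        self.reorder_batch_threshold = 1

    def initialize_attn_groups(self) -> None:
        # True iff at least one inner list is non-empty, so attention-free
        # models skip the calculation entirely.
        if any(len(groups) for groups in self.attn_groups):
            self.calculate_reorder_batch_threshold()
        else:
            # Disable reordering explicitly so later call sites see None.
            self.reorder_batch_threshold = None


runner = RunnerSketch(attn_groups=[])  # attention-free: no groups at all
runner.initialize_attn_groups()
assert runner.reorder_batch_threshold is None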
@@ -4149,6 +4155,11 @@ def calculate_reorder_batch_threshold(self) -> None:
             group.get_metadata_builder().reorder_batch_threshold
             for group in self._attn_group_iterator()
         ]
+        # If there are no attention groups (attention-free model) or no backend
+        # reports a threshold, leave reordering disabled.
+        if not reorder_batch_thresholds:
+            self.reorder_batch_threshold = None
+            return
         self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds)
 
     def _find_compatible_block_sizes(
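The early return in the second hunk is what prevents the crash: functools.reduce over an empty iterable with no initial value raises TypeError, which is exactly what an attention-free model (zero attention groups, hence zero reported thresholds) would hit. A small sketch of the failure and the fix; the min_none_high below is a stand-in written on the assumption that vLLM's helper of that name treats None as "no limit":

from functools import reduce
from typing import Optional


def min_none_high(a: Optional[int], b: Optional[int]) -> Optional[int]:
    # Stand-in; assumption: None means "no limit" and ranks high, so it
    # loses to any concrete threshold.
    if a is None:
        return b
    if b is None:
        return a
    return min(a, b)


thresholds: list[Optional[int]] = []  # attention-free model: nothing reported

# Without the guard, the old code path raises:
#   TypeError: reduce() of empty iterable with no initial value
try:
    reduce(min_none_high, thresholds)
except TypeError as exc:
    print(f"unguarded reduce fails: {exc}")

# The fixed code returns early instead, leaving reordering disabled:
reorder_batch_threshold = (
    None if not thresholds else reduce(min_none_high, thresholds)
)
assert reorder_batch_threshold is None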
