@@ -204,36 +204,6 @@ def _init_kv_cache_quant(
         layer.quant_method.create_weights(layer)
 
 
-def get_attention_context(
-    layer_name: str,
-) -> tuple[dict | object | None, "Attention | MLAAttention", torch.Tensor]:
-    """Extract attention context for a given layer.
-
-    This helper function extracts the attention metadata, attention layer
-    instance, and KV cache tensor for a specific layer.
-
-    Args:
-        layer_name: The name/identifier of the attention layer.
-
-    Returns:
-        A tuple containing:
-        - attn_metadata: Attention metadata for this specific layer, or None if
-          no metadata is available
-        - attn_layer: The attention layer instance (Attention or MLAAttention)
-        - kv_cache: The KV cache tensor for the current virtual engine
-
-    Note: attn_metadata may be None, but attn_layer and kv_cache are always
-    extracted from the forward context.
-    """
-    forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
-    if isinstance(attn_metadata, dict):
-        attn_metadata = attn_metadata[layer_name]
-    attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name]
-    kv_cache = attn_layer.kv_cache[forward_context.virtual_engine]
-    return attn_metadata, attn_layer, kv_cache
-
-
 class Attention(nn.Module, AttentionLayerBase):
     """Attention layer.
 
@@ -907,6 +877,36 @@ def maybe_calc_kv_scales_fake(
 )
 
 
+def get_attention_context(
+    layer_name: str,
+) -> tuple[dict | object | None, Attention | MLAAttention, torch.Tensor]:
+    """Extract attention context for a given layer.
+
+    This helper function extracts the attention metadata, attention layer
+    instance, and KV cache tensor for a specific layer.
+
+    Args:
+        layer_name: The name/identifier of the attention layer.
+
+    Returns:
+        A tuple containing:
+        - attn_metadata: Attention metadata for this specific layer, or None if
+          no metadata is available
+        - attn_layer: The attention layer instance (Attention or MLAAttention)
+        - kv_cache: The KV cache tensor for the current virtual engine
+
+    Note: attn_metadata may be None, but attn_layer and kv_cache are always
+    extracted from the forward context.
+    """
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    if isinstance(attn_metadata, dict):
+        attn_metadata = attn_metadata[layer_name]
+    attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name]
+    kv_cache = attn_layer.kv_cache[forward_context.virtual_engine]
+    return attn_metadata, attn_layer, kv_cache
+
+
 @maybe_transfer_kv_layer
 def unified_attention(
     query: torch.Tensor,
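
For reference, a minimal usage sketch of the relocated helper, not part of
this diff. The import path, the caller name, and the placeholder handling of
the metadata-free (profiling) case are illustrative assumptions:

    import torch

    # Assumed import path, based on the file this diff touches.
    from vllm.attention.layer import get_attention_context

    def attention_op_body(query: torch.Tensor, layer_name: str) -> torch.Tensor:
        # Unpack the per-layer context. Per the docstring above, attn_metadata
        # may be None, but attn_layer and kv_cache are always populated.
        attn_metadata, attn_layer, kv_cache = get_attention_context(layer_name)
        if attn_metadata is None:
            # Profiling/dummy run: no metadata yet, so return a correctly
            # shaped placeholder instead of invoking the kernel.
            return torch.empty_like(query)
        # Real run: hand attn_layer, kv_cache, and attn_metadata to the
        # backend kernel (dispatch elided in this sketch).
        raise NotImplementedError("backend dispatch elided in this sketch")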