pytorch · wwwjn · Dec 11, 2025 · Dec 5, 2025 · Dec 5, 2025 · Dec 9, 2025
@@ -6,7 +6,6 @@
 #
 # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
 
-import functools
 from collections.abc import Callable
 from typing import ClassVar, NamedTuple
 
@@ -235,18 +234,7 @@ def blocked_mask_mod(
 
 
 def get_sliding_window_mask_mod(window_size: int) -> _mask_mod_signature:
-    """Creates a sliding window mask that only attends to tokens within a fixed window size.
-
-    This implements causal sliding window attention where each token can only attend to:
-    - Itself (current token)
-    - Up to `window_size - 1` previous tokens
-    Args:
-        window_size: The maximum number of tokens to attend to (including current token).
-                    Must be >= 1. A window_size of 1 means attend only to self.
-
-    Returns:
-        A mask modifier function that implements causal sliding window masking.
-    """
+    """Creates a sliding window mask that only attends to tokens within a fixed window size."""
 
     if window_size < 1:
         raise ValueError(
@@ -268,13 +256,8 @@ def sliding_window_mod(
 _compiled_create_block_mask = torch.compile(create_block_mask)
 
 
-@functools.lru_cache(4)
 def create_attention_mask(*args, **kwargs):
-    """Create an attention mask using compiled create_block_mask.
-
-    This function is cached to avoid recreating BlockMasks for the same
-    arguments.
-    """
+    """Create an attention mask using compiled create_block_mask."""
     return _compiled_create_block_mask(*args, **kwargs)