
Commit a50f429

Use property
Signed-off-by: wangxiyuan <[email protected]>
1 parent 39684a0 commit a50f429

File tree: 15 files changed, +5 -29 lines changed

tests/worker/test_model_input.py

Lines changed: 0 additions & 2 deletions
@@ -16,8 +16,6 @@
 
 class MockAttentionBackend(AttentionBackend):
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         raise NotImplementedError

vllm/attention/backends/abstract.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ class AttentionBackend(ABC):
     # For some attention backends, we allocate an output tensor before
     # calling the custom op. When piecewise cudagraph is enabled, this
     # makes sure the output tensor is allocated inside the cudagraph.
-    use_output: bool
+    accept_output_buffer: bool = False
 
     @staticmethod
     @abstractmethod
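
The renamed flag makes the contract explicit: a backend that sets accept_output_buffer advertises that its kernel can write into a caller-allocated output tensor, so the caller can allocate that tensor up front and, under piecewise cudagraph, keep the allocation inside the captured graph. A minimal caller-side sketch of that idea (hypothetical function and argument names, not the actual vLLM call site):

import torch

def run_attention(backend_cls, impl, query, key, value, attn_metadata):
    # If the backend accepts an output buffer, allocate it before the
    # custom op so the allocation happens inside the cudagraph.
    if getattr(backend_cls, "accept_output_buffer", False):
        output = torch.empty_like(query)
        impl.forward(query, key, value, attn_metadata, output=output)
        return output
    # Otherwise the backend allocates and returns its own output tensor.
    return impl.forward(query, key, value, attn_metadata)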

vllm/attention/backends/blocksparse_attn.py

Lines changed: 0 additions & 2 deletions
@@ -87,8 +87,6 @@ def __post_init__(self):
 
 class BlocksparseFlashAttentionBackend(AttentionBackend):
 
-    use_output: bool = True
-
     @staticmethod
     def get_name() -> str:
         # For attention layer compatibility

vllm/attention/backends/flash_attn.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 
 class FlashAttentionBackend(AttentionBackend):
 
-    use_output: bool = True
+    accept_output_buffer: bool = True
 
     @staticmethod
     def get_supported_head_sizes() -> List[int]:
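
FlashAttention is the only backend in this commit that opts in with accept_output_buffer: bool = True. On the implementation side, the contract the flag implies could look roughly like the following sketch (assumed signature for illustration, not the real FlashAttentionImpl):

import torch

class OutputBufferAwareImpl:
    # Hypothetical sketch: a backend that declares accept_output_buffer
    # is expected to write its result into a caller-provided tensor
    # rather than always allocating a new one inside the op.
    def forward(self, query, key, value, attn_metadata, output=None):
        if output is None:
            output = torch.empty_like(query)
        # ... launch the attention kernel, writing into `output` ...
        return output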

vllm/attention/backends/flashinfer.py

Lines changed: 0 additions & 2 deletions
@@ -40,8 +40,6 @@
 
 class FlashInferBackend(AttentionBackend):
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         return "FLASHINFER"

vllm/attention/backends/hpu_attn.py

Lines changed: 0 additions & 2 deletions
@@ -22,8 +22,6 @@
 
 class HPUAttentionBackend(AttentionBackend):
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         return "HPU_ATTN"

vllm/attention/backends/ipex_attn.py

Lines changed: 0 additions & 2 deletions
@@ -17,8 +17,6 @@
 
 class IpexAttnBackend(AttentionBackend):
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         return "IPEX"

vllm/attention/backends/openvino.py

Lines changed: 0 additions & 2 deletions
@@ -37,8 +37,6 @@ def create_roi_tensor(
 
 class OpenVINOAttentionBackend(AttentionBackend):
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         return "OPENVINO"

vllm/attention/backends/pallas.py

Lines changed: 0 additions & 2 deletions
@@ -11,8 +11,6 @@
 
 class PallasAttentionBackend(AttentionBackend):
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         return "PALLAS"

vllm/attention/backends/placeholder_attn.py

Lines changed: 0 additions & 2 deletions
@@ -21,8 +21,6 @@
 class PlaceholderAttentionBackend(AttentionBackend):
     """Placeholder backend for when no attention is needed."""
 
-    use_output: bool = False
-
     @staticmethod
     def get_name() -> str:
         return "NO_ATTENTION"
