Revert "[Kernel] unified_attention for Attention.forward (vllm-project#11967)"

menhguin · menhguin · commit e8dd15904249 · 2025-01-24T23:56:59.000+08:00
This reverts commit 0f8cafe.
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
@@ -148,10 +148,19 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
+<<<<<<< HEAD
         if self.calculate_kv_scales and \
             attn_metadata.enable_kv_scales_calculation:
             self.calc_kv_scales(key, value)
         if self.use_output:
+=======
+
+        if self.use_direct_call:
+            return self.impl.forward(query, key, value, kv_cache,
+                                     attn_metadata, self._k_scale,
+                                     self._v_scale)
+        elif self.use_output:
+>>>>>>> parent of 0f8cafe2 ([Kernel] unified_attention for Attention.forward (#11967))
             output = torch.empty_like(query)
             hidden_size = query.size(-1)
             # Reshape the query, key, and value tensors.
@@ -163,19 +172,12 @@ def forward(
                 key = key.view(-1, self.num_kv_heads, self.head_size)
             if value is not None:
                 value = value.view(-1, self.num_kv_heads, self.head_size)
-            if self.use_direct_call:
-                unified_attention_with_output(query, key, value, output,
-                                              self.layer_name)
-            else:
-                torch.ops.vllm.unified_attention_with_output(
-                    query, key, value, output, self.layer_name)
+            torch.ops.vllm.unified_attention_with_output(
+                query, key, value, output, self.layer_name)
             return output.view(-1, hidden_size)
         else:
-            if self.use_direct_call:
-                return unified_attention(query, key, value, self.layer_name)
-            else:
-                return torch.ops.vllm.unified_attention(
-                    query, key, value, self.layer_name)
+            return torch.ops.vllm.unified_attention(query, key, value,
+                                                    self.layer_name)
 
     def calc_kv_scales(self, key, value):
         self._k_scale.copy_(torch.abs(key).max() / self.k_range)
diff --git a/vllm/utils.py b/vllm/utils.py
@@ -2183,6 +2183,7 @@ def bind_kv_cache(
         forward_ctx = ctx[layer_name]
         assert len(forward_ctx.kv_cache) == len(kv_cache)
         for ve, ve_kv_cache in enumerate(kv_cache):
+            assert forward_ctx.kv_cache[ve].numel() == 0
             forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]
 
 
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
@@ -29,7 +29,6 @@
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import DeviceConfig, VllmConfig
 from vllm.distributed.parallel_state import get_world_group
-from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
@@ -42,8 +41,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (bind_kv_cache, is_pin_memory_available,
-                        make_tensor_with_pad)
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -1300,9 +1298,6 @@ def create_dummy_seq_group_metadata(self,
     def profile_run(self) -> None:
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
-        bind_kv_cache(
-            self.vllm_config.compilation_config.static_forward_context,
-            [kv_caches])
         max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1]
         max_batch_size = min(self.max_num_batched_tokens // max_seq_len,
                              self.scheduler_config.max_num_seqs)
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
@@ -22,7 +22,6 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import bind_kv_cache
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.hpu_model_runner import HPUModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
@@ -282,8 +281,6 @@ def _init_cache_engine(self):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
-        bind_kv_cache(self.compilation_config.static_forward_context,
-                      self.hpu_cache)
 
     def _warm_up_model(self) -> None:
         # NOTE(kzawora): We should use virtual engine index here
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
@@ -8,7 +8,6 @@
 from transformers_neuronx.config import GenerationConfig
 
 from vllm.config import VllmConfig
-from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -318,15 +317,13 @@ def execute_model(
             raise ValueError(
                 "NeuronModelRunner does not support multi-step execution.")
 
-        with set_forward_context(None, self.vllm_config, 0):
-            hidden_states = self.model(
-                input_ids=model_input.input_tokens,
-                positions=model_input.input_positions,
-                input_block_ids=model_input.input_block_ids,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device),
-            )
+        hidden_states = self.model(
+            input_ids=model_input.input_tokens,
+            positions=model_input.input_positions,
+            input_block_ids=model_input.input_block_ids,
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
+                                         device=self.device),
+        )
 
         # Compute the logits only if the on-device sampling is turned off as
         # on-device sampling outputs the token ids.
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
@@ -8,7 +8,6 @@
 from vllm.attention import get_attn_backend
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
 from vllm.config import VllmConfig
-from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -355,8 +354,7 @@ def execute_model(
                                          device=self.device),
         }
 
-        with set_forward_context(attn_metadata, self.vllm_config, 0):
-            hidden_states = model_executable(**execute_model_kwargs)
+        hidden_states = model_executable(**execute_model_kwargs)
 
         # Compute the logits.
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
@@ -21,7 +21,6 @@
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
-from vllm.utils import bind_kv_cache
 from vllm.worker.openvino_model_runner import OpenVINOModelRunner
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
 
@@ -339,8 +338,6 @@ def _init_cache_engine(self) -> None:
             ov_device,
         )
         self.kv_cache = self.cache_engine.kv_cache
-        bind_kv_cache(self.compilation_config.static_forward_context,
-                      [self.kv_cache])
         self.model_runner.block_size = self.cache_engine.block_size
 
         assert self.kv_cache is not None
@@ -512,18 +509,12 @@ def model_profile_run():
 
             self.model_runner.block_size = tmp_cache_config.block_size
 
-            bind_kv_cache(self.compilation_config.static_forward_context,
-                          profiling_cache_engine.kv_cache)
             # Run the model with the dummy inputs.
             self.model_runner.execute_model(seqs,
                                             profiling_cache_engine.kv_cache)
 
-            # Explicitly revert bind_kv_cache and delete temporary KV cache
-            # manager to free KV cache when real inputs will be passed to OV
-            bind_kv_cache(self.compilation_config.static_forward_context, [[
-                torch.tensor([])
-                for _ in range(len(profiling_cache_engine.kv_cache))
-            ]])
+            # explicitly delete temporary KV cache manager to free KV cache
+            # when real inputs will be passed to OV
             del profiling_cache_engine
 
             logger.info(
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
@@ -13,7 +13,6 @@
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import VllmConfig
-from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader import get_model
@@ -272,9 +271,8 @@ def _dummy_run(
             torch._dynamo.mark_dynamic(t, 0)
             torch._dynamo.mark_dynamic(p, 0)
         # Dummy run.
-        with set_forward_context(attn_metadata, self.vllm_config, 0):
-            self.model(token_ids, position_ids, attn_metadata, input_lens, t,
-                       p, num_samples, kv_caches)
+        self.model(token_ids, position_ids, attn_metadata, input_lens, t, p,
+                   num_samples, kv_caches)
 
     def warmup_model(
         self,
@@ -673,13 +671,10 @@ def execute_model(
                 input_lens = model_input.input_lens[i:i + 1].to(self.device)
                 t = model_input.t[i:i + 1].to(self.device)
                 p = model_input.p[i:i + 1].to(self.device)
-                with set_forward_context(model_input.attn_metadata,
-                                         self.vllm_config,
-                                         model_input.virtual_engine):
-                    output_token_ids = self.model(token_ids, position_ids,
-                                                  attn_metadata, input_lens, t,
-                                                  p, model_input.num_samples,
-                                                  kv_caches)
+                output_token_ids = self.model(token_ids, position_ids,
+                                              attn_metadata, input_lens, t, p,
+                                              model_input.num_samples,
+                                              kv_caches)
                 next_token_ids.append(output_token_ids[0])
                 start_idx = end_idx
 
@@ -724,13 +719,10 @@ def execute_model(
             input_lens = model_input.input_lens.to(self.device)
             for i in range(num_steps):
                 slot_mapping = attn_metadata.slot_mapping
-                with set_forward_context(model_input.attn_metadata,
-                                         self.vllm_config,
-                                         model_input.virtual_engine):
-                    output_token_ids = self.model(token_ids, position_ids,
-                                                  attn_metadata, input_lens, t,
-                                                  p, model_input.num_samples,
-                                                  kv_caches)
+                output_token_ids = self.model(token_ids, position_ids,
+                                              attn_metadata, input_lens, t, p,
+                                              model_input.num_samples,
+                                              kv_caches)
                 self.cached_step_outputs.append(output_token_ids)
 
                 if i < num_steps - 1:
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
@@ -12,7 +12,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
 from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerBase,
@@ -108,8 +108,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                       torch.tensor([], dtype=torch.float32,
                                    device=self.device))
                      for _ in range(num_layers)]
-        bind_kv_cache(self.compilation_config.static_forward_context,
-                      [kv_caches])
         self.model_runner._dummy_run(
             batch_size=1,
             seq_len=self.scheduler_config.max_num_batched_tokens,
@@ -172,8 +170,6 @@ def initialize_cache(
                                       device="cpu")
             cpu_v_cache = torch.zeros_like(cpu_k_cache)
             self.cpu_cache.append((cpu_k_cache, cpu_v_cache))
-        bind_kv_cache(self.compilation_config.static_forward_context,
-                      [self.tpu_cache])
         self._warmup_model()
 
     def _warmup_model(self) -> None:
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
@@ -12,7 +12,6 @@
 from vllm.attention import get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group
-from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY, InputRegistry
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadataCache
@@ -574,17 +573,15 @@ def execute_model(
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
             model_forward_start_time = time.time()
-        with set_forward_context(model_input.attn_metadata, self.vllm_config,
-                                 model_input.virtual_engine):
-            hidden_or_intermediate_states = model_executable(
-                input_ids=model_input.input_tokens,
-                positions=model_input.input_positions,
-                kv_caches=kv_caches,
-                attn_metadata=model_input.attn_metadata,
-                intermediate_tensors=intermediate_tensors,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device))
+
+        hidden_or_intermediate_states = model_executable(
+            input_ids=model_input.input_tokens,
+            positions=model_input.input_positions,
+            kv_caches=kv_caches,
+            attn_metadata=model_input.attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
+                                         device=self.device))
         # Compute the logits in the last pipeline stage.
         if not get_pp_group().is_last_rank:
             return hidden_or_intermediate_states