@@ -1,6 +1,7 @@
 """A layer that samples the next tokens from the model's outputs."""
 from typing import Dict, List, Optional, Tuple

+import time
 import torch
 import torch.nn as nn

@@ -37,13 +38,20 @@ def forward(
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
         embedding_bias: Optional[torch.Tensor] = None,
+        logits: Optional[torch.Tensor] = None,
     ) -> Optional[SamplerOutput]:
-        # Get the hidden states that we use for sampling.
-        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
-
-        # Get the logits for the next tokens.
-        logits = _get_logits(hidden_states, embedding, embedding_bias,
-                             self.vocab_size)
+        if logits is None:
+            # Get the hidden states that we use for sampling.
+            hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
+
+            start = time.time()
+            # Get the logits for the next tokens.
+            logits = _get_logits(hidden_states, embedding, embedding_bias,
+                                 self.vocab_size)
+            end = time.time()
+            print(f'Out-of-model logits calculation (MatMul) took {(end - start)*1000} ms')
+        else:
+            logits = _prune_hidden_states(logits, sampling_metadata)

         # Only perform sampling in the driver worker.
         # Note: `_get_logits` is still distributed across TP workers because