Skip to content

Commit 8886423

Browse files
committed
Move float16 typecast hack to gptq marlin moe method
1 parent 565cc43 commit 8886423

File tree

2 files changed

+4
-1
lines changed

2 files changed

+4
-1
lines changed

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -584,6 +584,9 @@ def apply(
         from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
             fused_marlin_moe)

+        # The input must currently be float16
+        x = x.half()
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
```

vllm/model_executor/models/mixtral.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -99,7 +99,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states.half(), router_logits)
+        final_hidden_states = self.experts(hidden_states, router_logits)
         return final_hidden_states.view(orig_shape).to(orig_dtype)
```

0 commit comments

Comments
 (0)