Commit 970e06a

ElizaWszola authored and dsikka committed
Move output type conversion to gptq method as well
1 parent 1faab90 commit 970e06a

File tree

2 files changed: +3 −3 lines

vllm/model_executor/layers/quantization/gptq_marlin.py
vllm/model_executor/models/mixtral.py

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 2 additions & 1 deletion
@@ -585,6 +585,7 @@ def apply(
             fused_marlin_moe)
 
         # The input must currently be float16
+        orig_dtype = x.dtype
         x = x.half()
 
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -610,4 +611,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
-        )
+        ).to(orig_dtype)
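
The change moves the dtype round trip into the quantized MoE method itself: capture the caller's dtype, cast to float16 for the fused Marlin kernel, and cast back on return. A minimal sketch of that pattern, with run_fp16_kernel as a hypothetical stand-in for fused_marlin_moe:

    import torch

    def run_fp16_kernel(x: torch.Tensor) -> torch.Tensor:
        # Placeholder for the fused Marlin MoE kernel, which currently
        # only accepts float16 inputs; a real kernel would do the
        # quantized expert computation here.
        return x

    def apply_marlin_moe(x: torch.Tensor) -> torch.Tensor:
        orig_dtype = x.dtype        # remember the caller's dtype (e.g. bfloat16)
        x = x.half()                # the kernel requires float16
        out = run_fp16_kernel(x)    # stand-in for fused_marlin_moe(...)
        return out.to(orig_dtype)   # restore the caller's dtype on the way out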

vllm/model_executor/models/mixtral.py

Lines changed: 1 addition & 2 deletions
@@ -95,12 +95,11 @@ def __init__(self,
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
-        orig_dtype = hidden_states.dtype
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, router_logits)
-        return final_hidden_states.view(orig_shape).to(orig_dtype)
+        return final_hidden_states.view(orig_shape)
 
 
 class MixtralAttention(nn.Module):
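
With the cast now owned by the quantized apply method, callers such as Mixtral's forward no longer track dtype themselves. A quick check of the invariant this relies on, reusing the hypothetical sketch above (not vllm's actual API):

    x = torch.randn(8, 16, dtype=torch.bfloat16)
    y = apply_marlin_moe(x)
    assert y.dtype == x.dtype  # the caller's dtype survives the float16 round trip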
