
Commit bb218a6

[Misc][VIP] hotfix for gptq-marlin non-contiguous error (#9)
Signed-off-by: DefTruth <[email protected]>
1 parent 433ffcb commit bb218a6

3 files changed: +8 −5 lines

vllm/_custom_ops.py

Lines changed: 7 additions & 0 deletions
@@ -738,6 +738,13 @@ def gptq_marlin_gemm(a: torch.Tensor,
                      use_atomic_add: bool = False,
                      use_fp32_reduce: bool = False,
                      is_zp_float: bool = False) -> torch.Tensor:
+    # FIXME(DefTruth): Remove this patch once gptq_marlin_gemm
+    # supports non-contiguous input. Currently, marlin requires a
+    # contiguous memory layout, but prefix caching may cause `a`
+    # to be non-contiguous. We lower the non-contiguous fix into
+    # this function, since `gptq_marlin_gemm` is used in multiple
+    # code paths, by both AWQ and GPTQ.
+    a = a.contiguous()  # no-op if already contiguous
     return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros,
                                          g_idx, perm, workspace, b_q_type.id,
                                          size_m, size_n, size_k, is_k_full,
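
For context, a minimal sketch (not part of this commit) of the PyTorch behavior the guard relies on: Tensor.contiguous() returns the tensor itself when it is already contiguous, so the added line is free on the common path, while a sliced view, such as the ones the prefix-cache path can hand to the Marlin kernel, is non-contiguous and gets materialized into a fresh contiguous copy.

    import torch

    a = torch.randn(4, 8)
    assert a.is_contiguous()
    assert a.contiguous() is a        # already contiguous: returns self, no copy

    view = a[:, 2:]                   # slicing keeps the parent's strides
    assert not view.is_contiguous()   # a layout the marlin kernel cannot consume
    fixed = view.contiguous()         # materializes a contiguous copy
    assert fixed.is_contiguous()

Placing the guard inside gptq_marlin_gemm rather than at each call site is what allows the two removals below: every AWQ and GPTQ path ends up funnelling through this one wrapper.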

vllm/attention/backends/mla/common.py

Lines changed: 1 addition & 1 deletion
@@ -1161,7 +1161,7 @@ def _compute_prefill_context(
         k_pe = workspace[:toks]\
             [..., self.kv_lora_rank:].unsqueeze(1)
 
-        kv_nope = self.kv_b_proj(kv_c_normed.contiguous())[0].view( \
+        kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
             -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv_nope\
             .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)

vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py

Lines changed: 0 additions & 4 deletions
@@ -115,10 +115,6 @@ def apply_weights(self,
                       layer: torch.nn.Module,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        # marlin requires contiguous memory layout
-        # prefix caching may cause x to be non-contiguous
-        x = x.contiguous()  # no-op if already contiguous
-
         c = self.config
         w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)

0 commit comments