From 0abac6f6f01ff4c1fc275c13702e3269b5a9dfe3 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 30 Aug 2024 09:07:15 -0400
Subject: [PATCH 01/46] Enable 8-bit weights in Fused Marlin MoE

---
 csrc/moe/marlin_moe_ops.cu                    | 301 ++++++++++++------
 csrc/moe/marlin_moe_ops.h                     |   9 +-
 csrc/moe/torch_bindings.cpp                   |  11 +-
 tests/kernels/test_moe.py                     | 225 ++++++++++++-
 vllm/_custom_ops.py                           |   2 +-
 .../layers/fused_moe/__init__.py              |  16 +-
 .../layers/fused_moe/fused_moe.py             | 133 ++------
 .../layers/fused_moe/fused_moe_marlin.py      | 245 ++++++++++++++
 .../compressed_tensors_moe.py                 |  33 +-
 .../layers/quantization/utils/marlin_utils.py |  17 +
 .../quantization/utils/marlin_utils_test.py   |  11 +-
 .../layers/quantization/utils/quant_utils.py  |  19 +-
 12 files changed, 775 insertions(+), 247 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index 1e170e80d2f7..e3c18ce5a50b 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -25,6 +25,8 @@
 
 #include <iostream>
 
+#include "core/scalar_type.hpp"
+
 template <typename T>
 inline std::string str(T x) {
   return std::to_string(x);
@@ -131,11 +133,26 @@ __device__ inline int lop3(int a, int b, int c) {
   return res;
 }
 
-// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
-// values. We mostly follow the strategy in the link below, with some small
-// changes:
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
-__device__ inline FragB dequant(int q) {
+// Byte permute (PTX prmt.b32): builds the result from bytes of `a` and of the
+// immediate `start_byte`; the immediate `mask` is the per-byte selector.
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+template <vllm::ScalarTypeId w_type_id>
+__device__ inline FragB dequant(int q);
+
+// Efficiently dequantize 4bit values packed in an int32 value into a full
+// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
+// with some small changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+template <>
+__device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
   const int LO = 0x000f000f;
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
@@ -156,6 +173,28 @@ __device__ inline FragB dequant(int q) {
   return frag_b;
 }
 
+// Fast Int8ToFp16: dequantize the four 8-bit values packed in q to fp16; each
+// result equals (byte - 128). Reference:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+template <>
+__device__ inline FragB dequant<vllm::kU8B128.id()>(int q) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+  // lo gets bytes 0,2 of q, hi bytes 1,3; each sits under a 0x64 high byte.
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+  // fp16 0x64XX is 1024 + XX and 0x6480 is 1152, so the result is XX - 128.
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
 // Multiply dequantized values by the corresponding quantization scale; used
 // only for grouped quantization.
 __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
@@ -296,7 +335,8 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   __syncthreads();
 }
 
-template <const int threads,          // number of threads in a threadblock
+template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -331,6 +371,9 @@ __device__ inline void MarlinMoESingle(
     bool apply_weights,    // apply weights to output
     int current_m_block    // current m block to start kernel computation from
 ) {
+  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
+  constexpr int pack_factor = 32 / w_type.size_bits();
+
   // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
   // better partitioning with less reductions
   int parallel = 1;
@@ -423,12 +466,15 @@ __device__ inline void MarlinMoESingle(
   constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);
 
   // B sizes/strides
-  int b_gl_stride = 16 * prob_n / 32;
-  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
   int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
-  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
-  constexpr int b_sh_wr_delta = threads;
-  constexpr int b_sh_rd_delta = threads;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
   constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
   constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
 
@@ -465,12 +511,12 @@ __device__ inline void MarlinMoESingle(
       a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
   a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
 
-  int b_gl_rd =
-      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
+                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
   b_gl_rd += b_sh_stride * slice_col;
   b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x;
-  int b_sh_rd = threadIdx.x;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
 
   // For act_order
   constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@@ -571,7 +617,7 @@ __device__ inline void MarlinMoESingle(
 
   // Register storage for double buffer of shared memory reads.
   FragA frag_a[2][thread_m_blocks];
-  I4 frag_b_quant[2];
+  I4 frag_b_quant[2][b_thread_vecs];
   FragC frag_c[thread_m_blocks][4][2];
   FragS frag_s[2][4];         // No act-order
   FragS act_frag_s[2][4][4];  // For act-order
@@ -637,7 +683,10 @@ __device__ inline void MarlinMoESingle(
       int4* sh_b_stage = sh_b + b_sh_stage * pipe;
   #pragma unroll
       for (int i = 0; i < b_sh_wr_iters; i++) {
-        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+  #pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
         B_ptr[i] += b_gl_rd_delta_o;
       }
 
@@ -715,8 +764,12 @@ __device__ inline void MarlinMoESingle(
     for (int i = 0; i < thread_m_blocks; i++)
       ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
     int4* sh_b_stage = sh_b + b_sh_stage * pipe;
-    frag_b_quant[k % 2] = *reinterpret_cast<I4*>(
-        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
+
+  #pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
+          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
   };
 
   bool is_same_group[stages];
@@ -840,10 +893,19 @@ __device__ inline void MarlinMoESingle(
   // dequantization and matmul operations.
   #pragma unroll
     for (int j = 0; j < 4; j++) {
-      int b_quant = frag_b_quant[k % 2][j];
-      int b_quant_shift = b_quant >> 8;
+      int b_quant_0, b_quant_1;
+      if constexpr (w_type.size_bits() == 4) {
+        b_quant_0 = frag_b_quant[k % 2][0][j];
+        b_quant_1 = b_quant_0 >> 8;
+      } else {
+        static_assert(w_type.size_bits() == 8);
+        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
+        b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+      }
 
-      FragB frag_b0 = dequant(b_quant);
+      FragB frag_b0 = dequant<w_type_id>(b_quant_0);
+      FragB frag_b1 = dequant<w_type_id>(b_quant_1);
 
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
@@ -855,8 +917,6 @@ __device__ inline void MarlinMoESingle(
         }
       }
 
-      FragB frag_b1 = dequant(b_quant_shift);
-
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
@@ -881,13 +941,13 @@ __device__ inline void MarlinMoESingle(
   // multiple warps that accumulate their partial sums of the same output
   // location; which we have to reduce over in the end. We do in shared memory.
   auto thread_block_reduce = [&]() {
-    constexpr int red_off = threads / b_sh_stride / 2;
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
     if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride;
-      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
-      constexpr int red_sh_delta = b_sh_stride;
-      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
-                      (threadIdx.x % b_sh_stride);
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
+                      (threadIdx.x % b_sh_stride_threads);
 
       // Parallel logarithmic shared memory reduction. We make sure to avoid any
       // unnecessary read or write iterations, e.g., for two warps we write only
@@ -1035,8 +1095,10 @@ __device__ inline void MarlinMoESingle(
     auto write = [&](int idx, float c0, float c1, FragS& s) {
       half2 res = __halves2half2(__float2half(c0), __float2half(c1));
 
-      // For per-column quantization we finally apply the scale here
-      if constexpr (!has_act_order && group_blocks == -1) {
+      // For per-column quantization we finally apply the scale here (only for
+      // 4-bit)
+      if constexpr (!has_act_order && group_blocks == -1 &&
+                    w_type.size_bits() == 4) {
         res = __hmul2(res, s[0]);
       }
 
@@ -1169,25 +1231,67 @@ __device__ inline void MarlinMoESingle(
       // For per-column scales, we only fetch them here in the final step before
       // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
-        if (last) {
+        if constexpr (w_type.size_bits() == 8) {
           if (s_sh_wr_pred) {
             cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
           }
           cp_async_fence();
+        } else {
+          if (last) {
+            if (s_sh_wr_pred) {
+              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+            }
+            cp_async_fence();
+          }
         }
       }
 
       thread_block_reduce();
       if constexpr (!has_act_order && group_blocks == -1) {
-        if (last) {
+        if constexpr (w_type.size_bits() == 8) {
           cp_async_wait<0>();
           __syncthreads();
           if (threadIdx.x / 32 < thread_n_blocks / 4) {
             reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
             reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
           }
+
+        } else {
+          if (last) {
+            cp_async_wait<0>();
+            __syncthreads();
+            if (threadIdx.x / 32 < thread_n_blocks / 4) {
+              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (!has_act_order && group_blocks == -1 &&
+                    w_type.size_bits() == 8) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+            for (int j = 0; j < 4; j++) {
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][0]),
+                          frag_s[j / 2][2 * (j % 2) + 0]);
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][2]),
+                          frag_s[j / 2][2 * (j % 2) + 0]);
+
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][0]),
+                          frag_s[j / 2][2 * (j % 2) + 1]);
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][2]),
+                          frag_s[j / 2][2 * (j % 2) + 1]);
+            }
+          }
         }
       }
+
       if (slice_count > 1) {  // only globally reduce if there is more than one
                               // block in a slice
         barrier_acquire(&locks[slice_col], slice_idx);
@@ -1227,7 +1331,8 @@ __device__ inline void MarlinMoESingle(
   }
 }
 
-template <const int threads,          // number of threads in a threadblock
+template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1293,29 +1398,29 @@ __global__ void MarlinMoE(
   }
 
   if (max_block == 1) {
-    MarlinMoESingle<threads, 1, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 1, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 2) {
-    MarlinMoESingle<threads, 2, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 2, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 3) {
-    MarlinMoESingle<threads, 3, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 3, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else {
-    MarlinMoESingle<threads, 4, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 4, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
@@ -1342,7 +1447,8 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   return;
 }
 
-template <const int threads,          // number of threads in a threadblock
+template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1397,19 +1503,20 @@ const int STAGES = 4;  // 4 pipeline stages fit into shared memory
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;
 
-#define __CALL_IF_MOE(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,      \
-                      HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)               \
-  else if (thread_m_blocks == THREAD_M_BLOCKS &&                              \
+#define __CALL_IF_MOE(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS,               \
+                      THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS,           \
+                      NUM_THREADS)                                            \
+  else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&          \
            thread_n_blocks == THREAD_N_BLOCKS &&                              \
            thread_k_blocks == THREAD_K_BLOCKS &&                              \
            has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&  \
            num_threads == NUM_THREADS) {                                      \
     cudaFuncSetAttribute(                                                     \
-        MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,              \
+        MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
                   THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,      \
         cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);         \
-    MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
-              STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>                            \
+    MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,     \
+              THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>           \
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                    \
             A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,     \
             g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,            \
@@ -1494,42 +1601,43 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
   return thread_config_t{-1, -1, -1};
 }
 
-#define CALL_IF_MOE(N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-                                                               \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                               \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                               \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                               \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  /* has_act_order = false, thread_m_blocks = 1 */                     \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+  /* has_act_order = false, thread_m_blocks = 2 */                     \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+  /* has_act_order = false, thread_m_blocks = 3 */                     \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+  /* has_act_order = false, thread_m_blocks = 4 */                     \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
 
 void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
                          const void* sorted_ids, const void* topk_weights,
                          const void* topk_ids, const void* s, const void* g_idx,
                          const void* perm, void* a_tmp, void* expert_offsets,
                          int prob_m, int prob_n, int prob_k, void* workspace,
-                         bool has_act_order, bool is_k_full, int num_groups,
-                         int group_size, int num_experts, int topk,
-                         int moe_block_size, int dev, cudaStream_t stream,
-                         int thread_k, int thread_n, int sms, int max_par,
-                         bool replicate_input, bool apply_weights) {
+                         vllm::ScalarType const& q_type, bool has_act_order,
+                         bool is_k_full, int num_groups, int group_size,
+                         int num_experts, int topk, int moe_block_size, int dev,
+                         cudaStream_t stream, int thread_k, int thread_n,
+                         int sms, int max_par, bool replicate_input,
+                         bool apply_weights) {
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -1611,10 +1719,13 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
     has_act_order = false;
   }
 
+  int pack_factor = 32 / q_type.size_bits();
+
   for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
     const int4* A_ptr = (const int4*)A;
     int4* a_tmp_ptr = (int4*)a_tmp;
-    const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx;
+    const int4* B_ptr =
+        (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx;
     int4* C_ptr = (int4*)C;
     const float* topk_weights_ptr = (const float*)topk_weights;
     const int* sorted_ids_ptr = (const int*)sorted_ids;
@@ -1645,10 +1756,14 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
 
       if (false) {
       }
-      CALL_IF_MOE(16, 4, 256)
-      CALL_IF_MOE(8, 8, 256)
-      CALL_IF_MOE(8, 4, 128)
-      CALL_IF_MOE(4, 8, 128)
+      CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
+      CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
+      CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
+      CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
+      CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
+      CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
+      CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
+      CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
       else {
         TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
                                str(prob_n) + ", " + str(prob_k) + "]" +
@@ -1670,9 +1785,15 @@ torch::Tensor marlin_gemm_moe(
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
-    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
+    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
+    int64_t num_experts, int64_t topk, int64_t moe_block_size,
     bool replicate_input, bool apply_weights) {
+  TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
+              "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
+
+  int pack_factor = 32 / b_q_type->size_bits();
+
   int max_par = 4;
 
   int dev = a.get_device();
@@ -1733,8 +1854,8 @@ torch::Tensor marlin_gemm_moe(
       topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
       g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
       expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
-      has_act_order, is_k_full, num_groups, group_size, num_experts, topk,
-      moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
+      *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts,
+      topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
       thread_n, sms, max_par, replicate_input, apply_weights);
   return c;
-}
\ No newline at end of file
+}
diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h
index 01ba8ff69850..adee8399a4d6 100644
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@@ -2,11 +2,14 @@
 
 #include <torch/all.h>
 
+#include "core/scalar_type.hpp"
+
 torch::Tensor marlin_gemm_moe(
     const torch::Tensor& a, const torch::Tensor& b_q_weights,
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
-    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
-    bool replicate_input, bool apply_weights);
\ No newline at end of file
+    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
+    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
+    int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    bool replicate_input, bool apply_weights);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index d4d43e2c601b..d2352375de33 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -9,16 +9,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "token_expert_indices, Tensor gating_output) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
 
-#ifndef USE_ROCM
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
-      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
-      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
-      "bool replicate_input, bool apply_weights) -> Tensor");
-
+      "g_idx, Tensor! perm, Tensor! workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
+      "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
+      "int moe_block_size, bool replicate_input, bool apply_weights)"
+      " -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
-#endif
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index f526c381b333..f7642bf02b05 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -2,6 +2,8 @@
 
 Run `pytest tests/kernels/test_moe.py`.
 """
+from typing import List
+
 import pytest
 import torch
 from transformers import MixtralConfig
@@ -9,7 +11,12 @@
 
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+    fused_moe_marlin, single_moe_marlin)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
+from vllm.scalar_type import scalar_types
 
 
 def torch_moe(a, w1, w2, score, topk):
@@ -29,6 +36,20 @@ def torch_moe(a, w1, w2, score, topk):
             topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
 
 
+def torch_moe_single(a, w, score, topk):  # unweighted torch reference
+    B, D = a.shape  # a must be 2-D
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)  # topk copies/row
+    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    _, topk_ids = torch.topk(score, topk)  # top-k values are discarded
+    topk_ids = topk_ids.view(-1)  # one expert id per repeated row
+    for i in range(w.shape[0]):  # i is compared against the expert ids
+        mask = topk_ids == i
+        if mask.sum():  # skip experts that no row selected
+            out[mask] = a[mask] @ w[i].transpose(0, 1)
+    return (out.view(B, -1, w.shape[1])).sum(dim=1)  # sum the topk outputs
+
+
 @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 511, 1024])
@@ -43,11 +64,11 @@ def test_fused_moe(
     topk: int,
     dtype: torch.dtype,
 ):
-    a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
 
-    score = torch.randn((m, e), device='cuda', dtype=dtype)
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
     triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
     torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
@@ -99,3 +120,199 @@ def test_mixtral_moe(dtype: torch.dtype):
                                vllm_states,
                                rtol=mixtral_moe_tol[dtype],
                                atol=mixtral_moe_tol[dtype])
+
+
+def stack_and_dev(tensors: List[torch.Tensor]):
+    target_device = tensors[0].device
+    return torch.stack(tensors).to(target_device)
+
+
+def compute_max_diff(output, output_ref):
+    # Despite the name: mean absolute error relative to mean |output_ref|.
+    return (output - output_ref).abs().mean() / output_ref.abs().mean()
+
+
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [4, 8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [True, False])
+@pytest.mark.parametrize("num_bits", [4, 8])
+def test_fused_marlin_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+    act_order: bool,
+    num_bits: int,
+):
+    torch.manual_seed(7)
+
+    if topk > e:
+        return
+
+    # Filter act_order: it needs an explicit group_size smaller than k and n
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size in (k, n):
+            return
+
+    quant_type = (scalar_types.uint4b8
+                  if num_bits == 4 else scalar_types.uint8b128)
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    # NOTE(review): only expert 0 is set to eye(k, n); was w2[i] intended?
+    w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
+
+    w_ref1_l = []
+    qweight1_l = []
+    scales1_l = []
+    g_idx1_l = []
+    sort_indices1_l = []
+
+    for i in range(w1.shape[0]):
+        test_perm = torch.randperm(k)
+        w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
+            w1[i].transpose(1, 0), quant_type, group_size, act_order,
+            test_perm)
+        w_ref1_l.append(w_ref1)
+        qweight1_l.append(qweight1)
+        scales1_l.append(scales1)
+        g_idx1_l.append(g_idx1)
+        sort_indices1_l.append(sort_indices1)
+
+    w_ref1 = stack_and_dev(w_ref1_l)
+    qweight1 = stack_and_dev(qweight1_l).contiguous()
+    scales1 = stack_and_dev(scales1_l)
+    g_idx1 = stack_and_dev(g_idx1_l)
+    sort_indices1 = stack_and_dev(sort_indices1_l)
+
+    w_ref2_l = []
+    qweight2_l = []
+    scales2_l = []
+    g_idx2_l = []
+    sort_indices2_l = []
+
+    for i in range(w2.shape[0]):
+        test_perm = torch.randperm(n)
+        w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
+            w2[i].transpose(1, 0), quant_type, group_size, act_order,
+            test_perm)
+        w_ref2_l.append(w_ref2)
+        qweight2_l.append(qweight2)
+        scales2_l.append(scales2)
+        g_idx2_l.append(g_idx2)
+        sort_indices2_l.append(sort_indices2)
+
+    w_ref2 = stack_and_dev(w_ref2_l)
+    qweight2 = stack_and_dev(qweight2_l).contiguous()
+    scales2 = stack_and_dev(scales2_l)
+    g_idx2 = stack_and_dev(g_idx2_l)
+    sort_indices2 = stack_and_dev(sort_indices2_l)
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    triton_output = fused_moe(
+        a,
+        w_ref1.transpose(1, 2).contiguous(),
+        w_ref2.transpose(1, 2).contiguous(),
+        score,
+        topk,
+        renormalize=False,
+    )
+    marlin_output = fused_moe_marlin(
+        a,
+        qweight1,
+        qweight2,
+        score,
+        g_idx1,
+        g_idx2,
+        sort_indices1,
+        sort_indices2,
+        topk,
+        renormalize=False,
+        w1_scale=scales1,
+        w2_scale=scales2,
+        num_bits=num_bits,
+    )
+
+    assert compute_max_diff(marlin_output, triton_output) < 4e-2
+
+
+@pytest.mark.skip("This test is here for the sake of debugging, "
+                  "don't run it in automated tests.")
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [4, 8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [True, False])
+@pytest.mark.parametrize("num_bits", [4, 8])
+def test_marlin_moe_mmm(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+    act_order: bool,
+    num_bits: int,
+):
+    if topk > e:
+        return
+
+    # Filter act_order: it needs an explicit group_size smaller than k
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size == k:
+            return
+
+    quant_type = (scalar_types.uint4b8
+                  if num_bits == 4 else scalar_types.uint8b128)
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
+
+    w_ref_l = []
+    qweights_l = []
+    scales_l = []
+    g_idx_l = []
+    sort_indices_l = []
+    # Quantize every expert independently and collect the per-expert results.
+    for i in range(w.shape[0]):
+        test_perm = torch.randperm(k)
+        w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
+            w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
+        w_ref_l.append(w_ref)
+        qweights_l.append(qweight)
+        scales_l.append(scales)
+        g_idx_l.append(g_idx)
+        sort_indices_l.append(sort_indices)
+
+    w_ref = stack_and_dev(w_ref_l)
+    qweight = stack_and_dev(qweights_l).contiguous()
+    scales = stack_and_dev(scales_l)
+    g_idx = stack_and_dev(g_idx_l)
+    sort_indices = stack_and_dev(sort_indices_l)
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    marlin_output = single_moe_marlin(a,
+                                      qweight,
+                                      scales,
+                                      score,
+                                      g_idx,
+                                      sort_indices,
+                                      topk,
+                                      renormalize=False,
+                                      num_bits=num_bits)
+    torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
+
+    assert compute_max_diff(marlin_output, torch_output) < 1e-2
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index fe254732e730..51db8b34e291 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -314,7 +314,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                            num_bits: int) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     assert size_k % 16 == 0
-    output = torch.empty((num_experts, size_k // 16, size_n * 2),
+    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
                          device=b_q_weight.device,
                          dtype=b_q_weight.dtype)
     for e in range(num_experts):
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index fd6f41b90042..65a9b78a118c 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,17 +1,23 @@
+from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
-__all__ = ["FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported"]
+__all__ = [
+    "FusedMoE",
+    "FusedMoEMethodBase",
+    "FusedMoeWeightScaleSupported",
+    "fused_moe_marlin",
+    "single_moe_marlin",
+]
 
 if HAS_TRITON:
-
     from vllm.model_executor.layers.fused_moe.fused_moe import (
-        fused_experts, fused_marlin_moe, fused_moe, fused_topk,
-        get_config_file_name, grouped_topk)
+        fused_experts, fused_moe, fused_topk, get_config_file_name,
+        grouped_topk)
 
     __all__ += [
-        "fused_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index d2b152320e11..613d67e64bff 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -323,15 +323,22 @@ def get_moe_configs(E: int, N: int,
     return None
 
 
-def get_default_config(M: int, E: int, N: int, K: int, topk: int,
-                       dtype: Optional[str],
-                       is_marlin: bool) -> Dict[str, int]:
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+    is_marlin: bool,
+) -> Dict[str, int]:
     config = {
         'BLOCK_SIZE_M': 64,
         'BLOCK_SIZE_N': 64,
         'BLOCK_SIZE_K': 32,
         'GROUP_SIZE_M': 8
     }
+    # A heuristic: fused marlin works faster with this config for small M
     if M <= E or (is_marlin and M <= 32):
         config = {
             'BLOCK_SIZE_M': 16,
@@ -342,14 +349,15 @@ def get_default_config(M: int, E: int, N: int, K: int, topk: int,
     return config
 
 
-def try_get_optimal_moe_config(w1_shape: Tuple[int, ...],
-                               w2_shape: Tuple[int, ...],
-                               top_k: int,
-                               dtype: Optional[str],
-                               M: int,
-                               override_config: Optional[Dict[str,
-                                                              Any]] = None,
-                               is_marlin: bool = False):
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    override_config: Optional[Dict[str, Any]] = None,
+    is_marlin: bool = False,
+):
     if override_config:
         config = override_config
     else:
@@ -391,6 +399,7 @@ def fused_topk(
                                         topk,
                                         dtype=torch.int32,
                                         device=hidden_states.device)
+
     ops.topk_softmax(
         topk_weights,
         topk_ids,
@@ -437,108 +446,6 @@ def grouped_topk(hidden_states: torch.Tensor,
     return topk_weights, topk_ids
 
 
-def fused_marlin_moe(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w2: torch.Tensor,
-                     gating_output: torch.Tensor,
-                     g_idx1: torch.Tensor,
-                     g_idx2: torch.Tensor,
-                     rand_perm1: torch.Tensor,
-                     rand_perm2: torch.Tensor,
-                     topk: int,
-                     renormalize: bool,
-                     override_config: Optional[Dict[str, Any]] = None,
-                     use_fp8: bool = False,
-                     w1_scale: Optional[torch.Tensor] = None,
-                     w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
-    """
-    This function computes a Mixture of Experts (MoE) layer using two sets of
-    weights, w1 and w2, and top-k gating mechanism.
-    Parameters:
-    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - w1 (torch.Tensor): The first set of expert weights.
-    - w2 (torch.Tensor): The second set of expert weights.
-    - gating_output (torch.Tensor): The output of the gating operation
-        (before softmax).
-    - topk (int): The number of top-k experts to select.
-    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
-    - override_config (Optional[Dict[str, Any]]): Optional override
-        for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        products for w1 and w2. Defaults to False.
-    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
-        w1.
-    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
-        w2.
-    Returns:
-    - torch.Tensor: The output tensor after applying the MoE layer.
-    """
-    # Check constraints.
-    assert hidden_states.shape[0] == gating_output.shape[0], (
-        "Number of tokens mismatch")
-    assert hidden_states.shape[
-        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
-    assert hidden_states.shape[
-        1] == w2.shape[2] // 2, "Hidden size mismatch w2"
-    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
-    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
-    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-
-    #TODO fp8 is not implemented yet
-    assert not use_fp8
-
-    M, K = hidden_states.shape
-    E = w1.shape[0]
-    N = w2.shape[1] * 16
-
-    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                        renormalize)
-
-    get_config_func = functools.partial(try_get_optimal_moe_config,
-                                        w1.shape,
-                                        w2.shape,
-                                        topk_ids.shape[1],
-                                        "float8" if use_fp8 else None,
-                                        override_config=override_config,
-                                        is_marlin=True)
-    config = get_config_func(M)
-
-    block_size_m = config['BLOCK_SIZE_M']
-
-    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
-
-    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
-    workspace = torch.zeros(max_workspace_size,
-                            dtype=torch.int,
-                            device="cuda",
-                            requires_grad=False)
-
-    intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
-
-    intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe(
-        hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale,
-        g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk,
-        block_size_m, True, False)
-
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-
-    intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
-        intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids,
-        w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk,
-        block_size_m, False, True)
-
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                     dim=1)
-
-
 def get_config_dtype_str(dtype: torch.dtype,
                          use_int8_w8a16: Optional[bool] = False,
                          use_fp8_w8a8: Optional[bool] = False):
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
new file mode 100644
index 000000000000..40f9f66f1706
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -0,0 +1,245 @@
+"""Fused MoE utilities for GPTQ."""
+import functools
+from typing import Any, Dict, Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.scalar_type import scalar_types
+
+from .fused_moe import (fused_topk, moe_align_block_size,
+                        try_get_optimal_moe_config)
+
+
+def single_moe_marlin(
+    hidden_states: torch.Tensor,
+    w: torch.Tensor,
+    scales: torch.Tensor,
+    gating_output: torch.Tensor,
+    g_idx: torch.Tensor,
+    rand_perm: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    override_config: Optional[Dict[str, Any]] = None,
+    use_fp8: bool = False,
+    num_bits: int = 8,
+) -> torch.Tensor:
+    """
+    This function computes a Marlin MoE MMM using weights w
+    and top-k gating mechanism. It is meant for testing and debugging.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w (torch.Tensor): The set of expert weights.
+    - scales (torch.Tensor): The quantization scales for w.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx (torch.Tensor): The group index tensor for w.
+    - rand_perm (torch.Tensor): The sort indices for g_idx.
+    - topk (int): The number of top-k experts to select.
+    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
+    - use_fp8 (bool): Not supported yet; must be False (the default).
+    - num_bits (int): Bit width of the weights, 4 or 8. Defaults to 8.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[0], (
+        "Number of tokens mismatch")
+    assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch"
+    assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w.is_contiguous(), "Expert weights must be contiguous"
+    assert hidden_states.dtype in [
+        torch.float32, torch.float16, torch.bfloat16
+    ]
+    assert num_bits in [4, 8]
+    # TODO support this
+    assert not use_fp8
+
+    M, K = hidden_states.shape
+    E = w.shape[0]
+    N = w.shape[2] // (num_bits // 2)
+
+    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                        renormalize)
+
+    # This might not be an optimal config for a single MMM
+    get_config_func = functools.partial(try_get_optimal_moe_config,
+                                        w.shape,
+                                        w.shape,
+                                        topk_ids.shape[1],
+                                        "float8" if use_fp8 else None,
+                                        override_config=override_config,
+                                        is_marlin=True)
+    config = get_config_func(M)
+    block_size_m = config['BLOCK_SIZE_M']
+
+    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
+
+    max_workspace_size = (N // 64) * 16
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device=hidden_states.device,
+                            requires_grad=False)
+
+    scalar_type = (scalar_types.uint4b8
+                   if num_bits == 4 else scalar_types.uint8b128)
+
+    intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
+        hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
+        g_idx, rand_perm, workspace, scalar_type, M, N, K, True, E, topk,
+        block_size_m, True, False)
+
+    return torch.sum(intermediate_cache, dim=1)
+
+
+def fused_moe_marlin(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    g_idx1: torch.Tensor,
+    g_idx2: torch.Tensor,
+    rand_perm1: torch.Tensor,
+    rand_perm2: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    override_config: Optional[Dict[str, Any]] = None,
+    use_fp8: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    num_bits: int = 8,
+) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    weights, w1 and w2, and top-k gating mechanism.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of expert weights.
+    - w2 (torch.Tensor): The second set of expert weights.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx1 (torch.Tensor): The group index tensor for w1.
+    - g_idx2 (torch.Tensor): The group index tensor for w2.
+    - rand_perm1 (torch.Tensor): The sort indices for g_idx1.
+    - rand_perm2 (torch.Tensor): The sort indices for g_idx2.
+    - topk (int): The number of top-k experts to select.
+    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
+    - use_fp8 (bool): Not supported yet; must be False (the default).
+    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1.
+    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2.
+    - num_bits (int): Bit width of the weights, 4 or 8. Defaults to 8.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[
+        0], "Number of tokens mismatch"
+    assert hidden_states.shape[
+        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
+    assert hidden_states.shape[1] == w2.shape[2] // (
+        num_bits // 2), "Hidden size mismatch w2"
+    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [
+        torch.float32, torch.float16, torch.bfloat16
+    ]
+    assert num_bits in [4, 8]
+    # TODO support this
+    assert not use_fp8
+
+    M, K = hidden_states.shape
+    E = w1.shape[0]
+    N = w2.shape[1] * 16
+
+    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                        renormalize)
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        w2.shape,
+        topk_ids.shape[1],
+        "float8" if use_fp8 else None,
+        override_config=override_config,
+        is_marlin=True,
+    )
+    config = get_config_func(M)
+
+    block_size_m = config["BLOCK_SIZE_M"]
+
+    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
+
+    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
+    # Allocate on the input's device rather than the current CUDA device.
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device=hidden_states.device,
+                            requires_grad=False)
+
+    scalar_type = (scalar_types.uint4b8
+                   if num_bits == 4 else scalar_types.uint8b128)
+
+    intermediate_cache2 = torch.empty(
+        (M * topk_ids.shape[1], N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe(
+        hidden_states,
+        w1,
+        sorted_token_ids,
+        topk_weights,
+        topk_ids,
+        w1_scale,
+        g_idx1,
+        rand_perm1,
+        workspace,
+        scalar_type,
+        M,
+        2 * N,
+        K,
+        True,
+        E,
+        topk,
+        block_size_m,
+        True,
+        False,
+    )
+
+    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+
+    intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
+        intermediate_cache2,
+        w2,
+        sorted_token_ids,
+        topk_weights,
+        topk_ids,
+        w2_scale,
+        g_idx2,
+        rand_perm2,
+        workspace,
+        scalar_type,
+        M,
+        K,
+        N,
+        True,
+        E,
+        topk,
+        block_size_m,
+        False,
+        True,
+    )
+
+    return torch.sum(intermediate_cache3, dim=1)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 0e0ab9ce9169..ba4f719a3f97 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -266,18 +266,21 @@ def apply(self,
               num_expert_group: Optional[int] = None,
               topk_group: Optional[int] = None) -> torch.Tensor:
 
-        from vllm.model_executor.layers.fused_moe.fused_moe import (
-            fused_marlin_moe)
-
-        return fused_marlin_moe(x,
-                                layer.w13_weight_packed,
-                                layer.w2_weight_packed,
-                                router_logits,
-                                layer.w13_g_idx,
-                                layer.w2_g_idx,
-                                layer.w13_g_idx_sort_indices,
-                                layer.w2_g_idx_sort_indices,
-                                top_k,
-                                renormalize=renormalize,
-                                w1_scale=layer.w13_weight_scale,
-                                w2_scale=layer.w2_weight_scale)
+        from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+            fused_moe_marlin)
+
+        return fused_moe_marlin(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            router_logits,
+            layer.w13_g_idx,
+            layer.w2_g_idx,
+            layer.w13_g_idx_sort_indices,
+            layer.w2_g_idx_sort_indices,
+            top_k,
+            renormalize=renormalize,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            num_bits=self.num_bits,
+        )
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 0ec68ac5b0f2..699d5f184414 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -176,6 +176,23 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
     return s
 
 
+def marlin_moe_permute_scales(
+    s: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+):
+    """Apply marlin_permute_scales to each expert's slice of ``s``.
+
+    The leading dimension of ``s`` indexes experts; the result is allocated
+    with the first three dimensions, dtype and device of ``s``.
+    """
+    permuted = s.new_empty((s.shape[0], s.shape[1], s.shape[2]))
+    for idx, s_e in enumerate(s):
+        permuted[idx] = marlin_permute_scales(s_e, size_k, size_n, group_size)
+    return permuted
+
+
 def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
                        num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 7d08ac6f8746..4a06c5d63d52 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -1,6 +1,6 @@
 """Utility functions used for tests and benchmarks"""
 
-from typing import List
+from typing import List, Optional
 
 import numpy as np
 import torch
@@ -92,8 +92,11 @@ def get_weight_perm(num_bits: int):
     return perm
 
 
-def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int,
-                    act_order: bool):
+def marlin_quantize(w: torch.Tensor,
+                    quant_type: ScalarType,
+                    group_size: int,
+                    act_order: bool,
+                    test_perm: Optional[torch.Tensor] = None):
     size_k, size_n = w.shape
     num_bits = quant_type.size_bits
 
@@ -104,7 +107,7 @@ def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int,
 
     # Quantize (and apply act_order if provided)
     w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
-        w, quant_type, group_size, act_order)
+        w, quant_type, group_size, act_order, test_perm)
 
     # For act_order, sort the "weights" and "g_idx" so that group ids are
     # increasing
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 33f24ff5d54d..bdfda31de852 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,5 +1,5 @@
 """This file is used for /tests and /benchmarks"""
-from typing import List
+from typing import List, Optional
 
 import numpy
 import torch
@@ -53,7 +53,10 @@ def get_pack_factor(num_bits):
     return 32 // num_bits
 
 
-def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
+def permute_rows(q_w: torch.Tensor,
+                 w_ref: torch.Tensor,
+                 group_size: int,
+                 test_perm: Optional[torch.Tensor] = None):
     assert q_w.shape == w_ref.shape
 
     orig_device = q_w.device
@@ -64,7 +67,7 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
         g_idx[i] = i // group_size
 
     # Simulate act_order by doing a random permutation on K
-    rand_perm = torch.randperm(k_size)
+    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)
 
     g_idx = g_idx[rand_perm].contiguous()
     q_w = q_w[rand_perm, :].contiguous()
@@ -164,8 +167,11 @@ def reshape_w(w):
     )
 
 
-def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType,
-                          group_size: int, act_order: bool):
+def gptq_quantize_weights(w: torch.Tensor,
+                          quant_type: ScalarType,
+                          group_size: int,
+                          act_order: bool,
+                          test_perm: Optional[torch.Tensor] = None):
     size_k, _ = w.shape
 
     assert w.is_floating_point(), "w must be float"
@@ -186,7 +192,8 @@ def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType,
         ), "For act_order, groupsize = {} must be less than size_k = {}".format(
             group_size, size_k)
 
-        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size)
+        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size,
+                                                    test_perm)
 
     return w_ref, w_q, w_s, g_idx, rand_perm
 

From fdf69c2f5e6c4a3f5604d7a088abefd57a0a5508 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 30 Aug 2024 09:36:33 -0400
Subject: [PATCH 02/46] fix rocm

---
 csrc/moe/torch_bindings.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index d2352375de33..e4fce091d24a 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -9,6 +9,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "token_expert_indices, Tensor gating_output) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
 
+#ifndef USE_ROCM
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
@@ -19,5 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 }
+#endif
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

From 4da163b45096fb24ec62f30a26b7ecd4750bea67 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 30 Aug 2024 09:45:52 -0400
Subject: [PATCH 03/46] bad paste

---
 csrc/moe/torch_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index e4fce091d24a..cd65a8ee92b9 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -19,7 +19,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "int moe_block_size, bool replicate_input, bool apply_weights)"
       " -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
-}
 #endif
+}
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

From 21d2337a42e11fd16d9891b6bd959209b220aa16 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 30 Aug 2024 17:29:42 +0000
Subject: [PATCH 04/46] add test case; fix imports for tests

---
 tests/weight_loading/models.txt                          | 1 +
 vllm/model_executor/layers/fused_moe/__init__.py         | 8 ++++----
 vllm/model_executor/layers/fused_moe/fused_moe_marlin.py | 5 ++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index cbe30305c14f..7deb2880145c 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -15,6 +15,7 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 65a9b78a118c..06bd2706d7e4 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,5 +1,3 @@
-from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -8,16 +6,18 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "fused_moe_marlin",
-    "single_moe_marlin",
 ]
 
 if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
+    from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+        fused_moe_marlin, single_moe_marlin)
 
     __all__ += [
+        "fused_moe_marlin",
+        "single_moe_marlin",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
index 40f9f66f1706..40b409ebeb34 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -5,11 +5,10 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    fused_topk, moe_align_block_size, try_get_optimal_moe_config)
 from vllm.scalar_type import scalar_types
 
-from .fused_moe import (fused_topk, moe_align_block_size,
-                        try_get_optimal_moe_config)
-
 
 def single_moe_marlin(
     hidden_states: torch.Tensor,

From 638777a35922dfecbce7866547f5096539187603 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 30 Aug 2024 20:12:47 +0000
Subject: [PATCH 05/46] fix to adapt custom_routing_function

---
 .../layers/fused_moe/fused_moe_marlin.py            | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
index 40b409ebeb34..8c49333f7c84 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -1,6 +1,6 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 
 import torch
 
@@ -106,7 +106,8 @@ def fused_moe_marlin(
     rand_perm1: torch.Tensor,
     rand_perm2: torch.Tensor,
     topk: int,
-    renormalize: bool,
+    custom_routing_function: Optional[Callable] = None,
+    renormalize: bool = True,
     override_config: Optional[Dict[str, Any]] = None,
     use_fp8: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
@@ -161,8 +162,12 @@ def fused_moe_marlin(
     E = w1.shape[0]
     N = w2.shape[1] * 16
 
-    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                        renormalize)
+    if custom_routing_function is None:
+        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                            renormalize)
+    else:
+        topk_weights, topk_ids = custom_routing_function(
+            hidden_states, gating_output, topk, renormalize)
 
     get_config_func = functools.partial(
         try_get_optimal_moe_config,

From bd4b84d92bfb33c3456a73b8dd951490a2ce11b0 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Mon, 2 Sep 2024 03:04:07 -0400
Subject: [PATCH 06/46] Use select_experts to compute top_k tensors in fused
 moe

---
 tests/kernels/test_moe.py                      |  7 ++++++-
 .../layers/fused_moe/fused_moe_marlin.py       | 11 +++--------
 .../compressed_tensors_moe.py                  | 18 ++++++++++++++----
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index f7642bf02b05..2cfd76d1c780 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -11,6 +11,7 @@
 
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
     fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
@@ -218,6 +219,9 @@ def test_fused_marlin_moe(
     sort_indices2 = stack_and_dev(sort_indices2_l)
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    topk_weights, topk_ids = fused_topk(a, score, topk, False)
+
     triton_output = fused_moe(
         a,
         w_ref1.transpose(1, 2).contiguous(),
@@ -235,7 +239,8 @@ def test_fused_marlin_moe(
         g_idx2,
         sort_indices1,
         sort_indices2,
-        topk,
+        topk_weights,
+        topk_ids,
         renormalize=False,
         w1_scale=scales1,
         w2_scale=scales2,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
index 8c49333f7c84..45dead9740f4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -105,7 +105,8 @@ def fused_moe_marlin(
     g_idx2: torch.Tensor,
     rand_perm1: torch.Tensor,
     rand_perm2: torch.Tensor,
-    topk: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
     custom_routing_function: Optional[Callable] = None,
     renormalize: bool = True,
     override_config: Optional[Dict[str, Any]] = None,
@@ -161,13 +162,7 @@ def fused_moe_marlin(
     M, K = hidden_states.shape
     E = w1.shape[0]
     N = w2.shape[1] * 16
-
-    if custom_routing_function is None:
-        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                            renormalize)
-    else:
-        topk_weights, topk_ids = custom_routing_function(
-            hidden_states, gating_output, topk, renormalize)
+    topk = topk_ids.shape[1]
 
     get_config_func = functools.partial(
         try_get_optimal_moe_config,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 9632dbbae395..53769cb73153 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -5,7 +5,7 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
+from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
@@ -272,6 +272,16 @@ def apply(
         from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
             fused_moe_marlin)
 
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function)
+
         return fused_moe_marlin(
             x,
             layer.w13_weight_packed,
@@ -281,10 +291,10 @@ def apply(
             layer.w2_g_idx,
             layer.w13_g_idx_sort_indices,
             layer.w2_g_idx_sort_indices,
-            top_k,
-            custom_routing_function=custom_routing_function,
+            topk_weights,
+            topk_ids,
             renormalize=renormalize,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             num_bits=self.num_bits,
-        )
\ No newline at end of file
+        )

From bef6b53fc2043f6e7de262f90b381797ee0574ad Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Tue, 3 Sep 2024 10:42:10 -0400
Subject: [PATCH 07/46] bring back fused_moe_marlin -> fused_marlin_moe

---
 tests/kernels/test_moe.py                                 | 8 ++++----
 vllm/model_executor/layers/fused_moe/__init__.py          | 8 ++++----
 .../{fused_moe_marlin.py => fused_marlin_moe.py}          | 4 ++--
 .../compressed_tensors/compressed_tensors_moe.py          | 6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)
 rename vllm/model_executor/layers/fused_moe/{fused_moe_marlin.py => fused_marlin_moe.py} (99%)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 2cfd76d1c780..606997843982 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -11,9 +11,9 @@
 
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
@@ -230,7 +230,7 @@ def test_fused_marlin_moe(
         topk,
         renormalize=False,
     )
-    marlin_output = fused_moe_marlin(
+    marlin_output = fused_marlin_moe(
         a,
         qweight1,
         qweight2,
@@ -309,7 +309,7 @@ def test_marlin_moe_mmm(
     sort_indices = stack_and_dev(sort_indices_l)
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
-    marlin_output = single_moe_marlin(a,
+    marlin_output = single_marlin_moe(a,
                                       qweight,
                                       scales,
                                       score,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 06bd2706d7e4..e9b5703ca28b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -9,15 +9,15 @@
 ]
 
 if HAS_TRITON:
+    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
-    from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-        fused_moe_marlin, single_moe_marlin)
 
     __all__ += [
-        "fused_moe_marlin",
-        "single_moe_marlin",
+        "fused_marlin_moe",
+        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
similarity index 99%
rename from vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
rename to vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 45dead9740f4..5866c83cd9c8 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -10,7 +10,7 @@
 from vllm.scalar_type import scalar_types
 
 
-def single_moe_marlin(
+def single_marlin_moe(
     hidden_states: torch.Tensor,
     w: torch.Tensor,
     scales: torch.Tensor,
@@ -96,7 +96,7 @@ def single_moe_marlin(
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
 
 
-def fused_moe_marlin(
+def fused_marlin_moe(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w2: torch.Tensor,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 53769cb73153..b14ef433d539 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -269,8 +269,8 @@ def apply(
         custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
 
-        from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-            fused_moe_marlin)
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            fused_marlin_moe)
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -282,7 +282,7 @@ def apply(
             num_expert_group=num_expert_group,
             custom_routing_function=custom_routing_function)
 
-        return fused_moe_marlin(
+        return fused_marlin_moe(
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,

From db1f07e8639badced65c6b85f812567f83442a74 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Tue, 3 Sep 2024 11:03:53 -0400
Subject: [PATCH 08/46] GPTQ Fused MoE class

---
 .../layers/fused_moe/__init__.py              |   3 +-
 vllm/model_executor/layers/fused_moe/layer.py | 155 +++++++++++++++++-
 2 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index e9b5703ca28b..7f27e2660db6 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,11 +1,12 @@
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, GPTQFusedMoE)
 from vllm.triton_utils import HAS_TRITON
 
 __all__ = [
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
+    "GPTQFusedMoE",
 ]
 
 if HAS_TRITON:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 3df0b61a9ebe..9643642b9b53 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -498,4 +498,157 @@ def _load_fp8_scale(self, param: torch.nn.Parameter,
                 param_data[expert_id][idx] = loaded_weight
             # If we are in the row parallel case (down_proj)
             else:
-                param_data[expert_id] = loaded_weight
\ No newline at end of file
+                param_data[expert_id] = loaded_weight
+
+
+class GPTQFusedMoE(torch.nn.Module):
+    """GPTQFusedMoE layer for GPTQ MoE models.
+    This layer contains both MergedColumnParallel weights (gate_up_proj / 
+    w13) and RowParallelLinear weights (down_proj/ w2).
+    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+    copy that naming convention here and handle any remapping in the
+    load_weights function in each model implementation.
+    Args:
+        num_experts: Number of experts in the model
+        top_k: Number of experts selected for each token
+        hidden_size: Input hidden state size of the transformer
+        intermediate_size: Intermediate size of the experts
+        params_dtype: Data type for the parameters.
+        reduce_results: Whether to all all_reduce on the output of the layer
+        renomalize: Whether to renormalize the logits in the fused_moe kernel
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = False,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        self.tp_size = (tp_size if tp_size is not None else
+                        get_tensor_model_parallel_world_size())
+        self.top_k = top_k
+        self.num_experts = num_experts
+        self.intermediate_size = intermediate_size
+        self.intermediate_size_per_partition = intermediate_size // self.tp_size
+        self.reduce_results = reduce_results
+        self.renormalize = renormalize
+        assert (not use_grouped_topk and num_expert_group is None
+                and topk_group is None)
+
+        if quant_config is None:
+            self.quant_method: Optional[
+                QuantizeMethodBase] = UnquantizedFusedMoEMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)
+        assert self.quant_method is not None
+
+        self.quant_method.create_weights(
+            layer=self,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=self.intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            weight_loader=self.weight_loader,
+        )
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, weight_name: str,
+                      shard_id: str, expert_id: int) -> None:
+
+        if ("_qweight" in weight_name or "_scales" in weight_name
+                or "_qzeros" in weight_name):
+            if "w13" in weight_name:
+                shard_size = loaded_weight.size()[-1]
+                if shard_id == "w1":
+                    param.data[expert_id, :, :shard_size] = loaded_weight
+                elif shard_id == "w2" or shard_id == "w3":
+                    param.data[expert_id, :, shard_size:] = loaded_weight
+                else:
+                    raise ValueError(f"Invalid shard_id: {shard_id}: "
+                                     "must be w1, w2, or w3.")
+            elif "w2" in weight_name:
+                param.data[expert_id][:] = loaded_weight
+            else:
+                raise ValueError(f"Invalid weight name: {weight_name}: "
+                                 "must contain 'w13' or 'w2'.")
+        elif "_g_idx" in weight_name:
+            if "w13" not in weight_name and "w2" not in weight_name:
+                raise ValueError(f"Invalid weight name: {weight_name}: "
+                                 "must contain 'w13' or 'w2'.")
+            param.data[expert_id] = loaded_weight
+        else:
+            raise ValueError(f"Invalid weight name: {weight_name}.")
+
+    @staticmethod
+    def select_experts(hidden_states: torch.Tensor,
+                       router_logits: torch.Tensor,
+                       top_k: int,
+                       use_grouped_topk: bool,
+                       renormalize: bool,
+                       topk_group: Optional[int] = None,
+                       num_expert_group: Optional[int] = None):
+        assert (not use_grouped_topk and topk_group is None
+                and num_expert_group is None)
+        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+
+        topk_weights, topk_ids = fused_topk(hidden_states=hidden_states,
+                                            gating_output=router_logits,
+                                            topk=top_k,
+                                            renormalize=renormalize)
+
+        return topk_weights, topk_ids
+
+    def forward(self, hidden_states: torch.Tensor,
+                router_logits: torch.Tensor):
+        assert self.quant_method is not None
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=False,
+            topk_group=False,
+            num_expert_group=False)
+
+        if self.reduce_results and self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        return final_hidden_states
+
+    @classmethod
+    def make_expert_params_mapping(
+            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
+            ckpt_up_proj_name: str,
+            num_experts: int) -> List[Tuple[str, str, int, str]]:
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_" if weight_name
+             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
+             f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
+            for expert_id in range(num_experts) for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]

From 6753789bbe7e636a51a7a2adca10a24968bf76f1 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Tue, 3 Sep 2024 12:41:52 -0400
Subject: [PATCH 09/46] Add GPTQMarlinMoEMethod to gptq_marlin.py

---
 .../layers/quantization/gptq_marlin.py        | 304 +++++++++++++++++-
 1 file changed, 289 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 94eb3f301541..1588b2a6113a 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,18 +1,25 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from torch.nn import Parameter
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe)
+from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
+                                                        FusedMoEMethodBase,
+                                                        GPTQFusedMoE)
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full,
-    marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales,
-    marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor,
-    verify_marlin_supported, verify_marlin_supports_shape)
+    marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales,
+    marlin_permute_scales, marlin_repeat_scales_on_all_ranks,
+    marlin_sort_g_idx, replace_tensor, verify_marlin_supported,
+    verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
@@ -33,8 +40,14 @@ class GPTQMarlinConfig(QuantizationConfig):
         (8, True): scalar_types.uint8b128,
     }
 
-    def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
-                 is_sym: bool, lm_head_quantized: bool) -> None:
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+    ) -> None:
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
             # (since we have only one group per output channel)
@@ -109,11 +122,14 @@ def override_quantization_method(cls, hf_quant_cfg,
                         " faster inference")
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["GPTQMarlinLinearMethod"]:
-        if (isinstance(layer, LinearBase) or
-            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]:
+        if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
+                                             and self.lm_head_quantized):
             return GPTQMarlinLinearMethod(self)
+        elif isinstance(layer, GPTQFusedMoE):
+            return GPTQMarlinMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -179,7 +195,8 @@ def create_weights(
             output_size_per_partition=output_size_per_partition,
             input_size_per_partition=input_size_per_partition,
             input_size=input_size,
-            group_size=group_size)
+            group_size=group_size,
+        )
 
         # Determine sharding
         if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act,
@@ -299,7 +316,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             perm=layer.g_idx_sort_indices,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
-            num_bits=self.quant_config.quant_type.size_bits)
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
         replace_tensor(layer, "qweight", marlin_qweight)
 
         # Permute scales from autogptq format to marlin format.
@@ -308,7 +326,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             size_k=(layer.input_size if self.quant_config.desc_act else
                     layer.input_size_per_partition),
             size_n=layer.output_size_per_partition,
-            group_size=self.quant_config.group_size)
+            group_size=self.quant_config.group_size,
+        )
         replace_tensor(layer, "scales", marlin_scales)
 
     def apply(
@@ -329,4 +348,259 @@ def apply(
             output_size_per_partition=layer.output_size_per_partition,
             input_size_per_partition=layer.input_size_per_partition,
             is_k_full=layer.is_k_full,
-            bias=bias)
+            bias=bias,
+        )
+
+
+class GPTQMarlinMoEMethod(FusedMoEMethodBase):
+    """MoE Marlin method with quantization."""
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Currently assuming is_k_full is always True
+        # (input size per partition is the same as full input size)
+        # Supports only sym for now (no zp)
+        if self.quant_config.group_size != -1:
+            scales_size13 = hidden_size // self.quant_config.group_size
+            scales_size2 = intermediate_size // self.quant_config.group_size
+        else:
+            scales_size13 = 1
+            scales_size2 = 1
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.quant_config.pack_factor,
+                2 * intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size // self.quant_config.pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        # up_proj scales
+        w13_scales = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size13,
+                        2 * intermediate_size,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+        # down_proj scales
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size2,
+                        hidden_size,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+        # up_proj zero points
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size13,
+                        2 * intermediate_size // self.quant_config.pack_factor,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+        # down_proj zero points
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size2,
+                        hidden_size // self.quant_config.pack_factor,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices",
+                                 w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices",
+                                 w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # Process act_order
+        if self.quant_config.desc_act:
+            # Get sorting based on g_idx
+            num_experts = layer.w13_g_idx.shape[0]
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(
+                    layer.w13_g_idx[e]).to(torch.int32)
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
+                    torch.int32)
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][
+                    w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][
+                    w2_g_idx_sort_indices[e]]
+            replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx)
+            replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx)
+            replace_tensor(layer, "w13_g_idx_sort_indices",
+                           w13_g_idx_sort_indices)
+            replace_tensor(layer, "w2_g_idx_sort_indices",
+                           w2_g_idx_sort_indices)
+        else:
+            # Reset g_idx related tensors
+            num_experts = layer.w13_g_idx.shape[0]
+            device = layer.w13_g_idx.device
+            layer.w13_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+        # Repack weights
+        marlin_w13_qweight = ops.gptq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w13_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_tensor(layer, "w13_qweight", marlin_w13_qweight)
+        marlin_w2_qweight = ops.gptq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w2_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_tensor(layer, "w2_qweight", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=(layer.intermediate_size if self.quant_config.desc_act else
+                    layer.intermediate_size_per_partition),
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_tensor(layer, "w13_scales", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor,
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_tensor(layer, "w2_scales", marlin_w2_scales)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+    ) -> torch.Tensor:
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=None)
+
+        return fused_marlin_moe(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            router_logits,
+            layer.w13_g_idx,
+            layer.w2_g_idx,
+            layer.w13_g_idx_sort_indices,
+            layer.w2_g_idx_sort_indices,
+            topk_weights,
+            topk_ids,
+            renormalize=renormalize,
+            w1_scale=layer.w13_scales,
+            w2_scale=layer.w2_scales,
+            num_bits=self.quant_config.quant_type.size_bits,
+        )

From 7df4014ce516363202cf3646a9c0598fb9cdeed8 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 09:00:15 -0400
Subject: [PATCH 10/46] Use FusedMoE layer for all loads

---
 .../layers/fused_moe/__init__.py              |   3 +-
 vllm/model_executor/layers/fused_moe/layer.py | 172 ++----------------
 .../layers/quantization/gptq_marlin.py        |   5 +-
 3 files changed, 22 insertions(+), 158 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 7f27e2660db6..e9b5703ca28b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,12 +1,11 @@
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, GPTQFusedMoE)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
 __all__ = [
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "GPTQFusedMoE",
 ]
 
 if HAS_TRITON:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 9643642b9b53..b0d7d4b538df 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -334,6 +334,25 @@ def weight_loader(self, param: torch.nn.Parameter,
             loaded_weight = loaded_weight.t().contiguous()
             shard_dim = ~shard_dim
 
+        # GPTQ Values
+        if ("scales" in weight_name or "qweight" in weight_name
+                or "qzeros" in weight_name):
+            if (shard_id == "w1" or shard_id == "w3"):
+                shard_dim = 1 - shard_dim
+            self._load_model_weight_or_group_weight_scale(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank)
+            return
+
+        if "g_idx" in weight_name:
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return
+
         # Case weight_scales
         if "weight_scale" in weight_name:
             # load the weight scaling based on the quantization scheme
@@ -499,156 +518,3 @@ def _load_fp8_scale(self, param: torch.nn.Parameter,
             # If we are in the row parallel case (down_proj)
             else:
                 param_data[expert_id] = loaded_weight
-
-
-class GPTQFusedMoE(torch.nn.Module):
-    """GPTQFusedMoE layer for GPTQ MoE models.
-    This layer contains both MergedColumnParallel weights (gate_up_proj / 
-    w13) and RowParallelLinear weights (down_proj/ w2).
-    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
-    copy that naming convention here and handle any remapping in the
-    load_weights function in each model implementation.
-    Args:
-        num_experts: Number of experts in the model
-        top_k: Number of experts selected for each token
-        hidden_size: Input hidden state size of the transformer
-        intermediate_size: Intermediate size of the experts
-        params_dtype: Data type for the parameters.
-        reduce_results: Whether to all all_reduce on the output of the layer
-        renomalize: Whether to renormalize the logits in the fused_moe kernel
-        quant_config: Quantization configure.
-    """
-
-    def __init__(
-        self,
-        num_experts: int,
-        top_k: int,
-        hidden_size: int,
-        intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        reduce_results: bool = False,
-        renormalize: bool = True,
-        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
-        topk_group: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        self.tp_size = (tp_size if tp_size is not None else
-                        get_tensor_model_parallel_world_size())
-        self.top_k = top_k
-        self.num_experts = num_experts
-        self.intermediate_size = intermediate_size
-        self.intermediate_size_per_partition = intermediate_size // self.tp_size
-        self.reduce_results = reduce_results
-        self.renormalize = renormalize
-        assert (not use_grouped_topk and num_expert_group is None
-                and topk_group is None)
-
-        if quant_config is None:
-            self.quant_method: Optional[
-                QuantizeMethodBase] = UnquantizedFusedMoEMethod()
-        else:
-            self.quant_method = quant_config.get_quant_method(self, prefix)
-        assert self.quant_method is not None
-
-        self.quant_method.create_weights(
-            layer=self,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=self.intermediate_size_per_partition,
-            params_dtype=params_dtype,
-            weight_loader=self.weight_loader,
-        )
-
-    def weight_loader(self, param: torch.nn.Parameter,
-                      loaded_weight: torch.Tensor, weight_name: str,
-                      shard_id: str, expert_id: int) -> None:
-
-        if ("_qweight" in weight_name or "_scales" in weight_name
-                or "_qzeros" in weight_name):
-            if "w13" in weight_name:
-                shard_size = loaded_weight.size()[-1]
-                if shard_id == "w1":
-                    param.data[expert_id, :, :shard_size] = loaded_weight
-                elif shard_id == "w2" or shard_id == "w3":
-                    param.data[expert_id, :, shard_size:] = loaded_weight
-                else:
-                    raise ValueError(f"Invalid shard_id: {shard_id}: "
-                                     "must be w1, w2, or w3.")
-            elif "w2" in weight_name:
-                param.data[expert_id][:] = loaded_weight
-            else:
-                raise ValueError(f"Invalid weight name: {weight_name}: "
-                                 "must contain 'w13' or 'w2'.")
-        elif "_g_idx" in weight_name:
-            if "w13" not in weight_name and "w2" not in weight_name:
-                raise ValueError(f"Invalid weight name: {weight_name}: "
-                                 "must contain 'w13' or 'w2'.")
-            param.data[expert_id] = loaded_weight
-        else:
-            raise ValueError(f"Invalid weight name: {weight_name}.")
-
-    @staticmethod
-    def select_experts(hidden_states: torch.Tensor,
-                       router_logits: torch.Tensor,
-                       top_k: int,
-                       use_grouped_topk: bool,
-                       renormalize: bool,
-                       topk_group: Optional[int] = None,
-                       num_expert_group: Optional[int] = None):
-        assert (not use_grouped_topk and topk_group is None
-                and num_expert_group is None)
-        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-
-        topk_weights, topk_ids = fused_topk(hidden_states=hidden_states,
-                                            gating_output=router_logits,
-                                            topk=top_k,
-                                            renormalize=renormalize)
-
-        return topk_weights, topk_ids
-
-    def forward(self, hidden_states: torch.Tensor,
-                router_logits: torch.Tensor):
-        assert self.quant_method is not None
-
-        # Matrix multiply.
-        final_hidden_states = self.quant_method.apply(
-            layer=self,
-            x=hidden_states,
-            router_logits=router_logits,
-            top_k=self.top_k,
-            renormalize=self.renormalize,
-            use_grouped_topk=False,
-            topk_group=False,
-            num_expert_group=False)
-
-        if self.reduce_results and self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
-
-        return final_hidden_states
-
-    @classmethod
-    def make_expert_params_mapping(
-            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
-            ckpt_up_proj_name: str,
-            num_experts: int) -> List[Tuple[str, str, int, str]]:
-
-        return [
-            # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_" if weight_name
-             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
-             f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
-            for expert_id in range(num_experts) for shard_id, weight_name in [
-                ("w1", ckpt_gate_proj_name),
-                ("w2", ckpt_down_proj_name),
-                ("w3", ckpt_up_proj_name),
-            ]
-        ]
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 1588b2a6113a..15530e692eb3 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -8,8 +8,7 @@
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     fused_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
-                                                        FusedMoEMethodBase,
-                                                        GPTQFusedMoE)
+                                                        FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -128,7 +127,7 @@ def get_quant_method(
         if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
                                              and self.lm_head_quantized):
             return GPTQMarlinLinearMethod(self)
-        elif isinstance(layer, GPTQFusedMoE):
+        elif isinstance(layer, FusedMoE):
             return GPTQMarlinMoEMethod(self)
         return None
 

From 2fa03e5f5f0916ec8c36d446dcde526bf27d2b99 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 11:25:08 -0400
Subject: [PATCH 11/46] Make sure that GPTQ runs through mixtral.py

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 6 +++---
 vllm/model_executor/model_loader/utils.py              | 2 +-
 vllm/model_executor/models/mixtral.py                  | 6 ++++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 15530e692eb3..fbf384ea34dc 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from torch.nn import Parameter
@@ -551,8 +551,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Repack scales
         marlin_w13_scales = marlin_moe_permute_scales(
             s=layer.w13_scales,
-            size_k=(layer.intermediate_size if self.quant_config.desc_act else
-                    layer.intermediate_size_per_partition),
+            size_k=layer.intermediate_size_per_partition,
             size_n=layer.w13_scales.shape[2],
             group_size=self.quant_config.group_size,
         )
@@ -575,6 +574,7 @@ def apply(
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
 
         topk_weights, topk_ids = FusedMoE.select_experts(
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 4bb943ab3afe..d247e4cf3f07 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -23,7 +23,7 @@ def get_model_architecture(
     architectures = getattr(model_config.hf_config, "architectures", [])
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
-    mixtral_supported = ["fp8", "compressed-tensors"]
+    mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin"]
     if (model_config.quantization is not None
             and model_config.quantization not in mixtral_supported
             and "MixtralForCausalLM" in architectures):
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index e744e36ac08b..6413b56605ec 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -435,7 +435,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
+                if name.endswith("bias") and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -454,6 +454,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
+                    if name.endswith("bias") and name not in params_dict:
+                        continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(param,
@@ -464,7 +466,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
+                    if name.endswith("bias") and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):

From b45594ccfc87097933850e553244dcad2645a3dc Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Wed, 4 Sep 2024 15:28:23 +0000
Subject: [PATCH 12/46] remove large model

---
 tests/weight_loading/models.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 5eee2cc53444..1dc529037a98 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -21,7 +21,6 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main

From 8a504d936aff8b3955f25ece553efb6366c52e3e Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 12:40:52 -0400
Subject: [PATCH 13/46] enforce float16A/scales for marlin moe

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++--
 vllm/model_executor/models/mixtral.py                  | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index fbf384ea34dc..d52ff3131fde 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -404,7 +404,7 @@ def create_weights(
             torch.empty(num_experts,
                         scales_size13,
                         2 * intermediate_size,
-                        dtype=params_dtype),
+                        dtype=torch.half),
             requires_grad=False,
         )
         layer.register_parameter("w13_scales", w13_scales)
@@ -414,7 +414,7 @@ def create_weights(
             torch.empty(num_experts,
                         scales_size2,
                         hidden_size,
-                        dtype=params_dtype),
+                        dtype=torch.half),
             requires_grad=False,
         )
         layer.register_parameter("w2_scales", w2_scales)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 6413b56605ec..148ef393277e 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -95,11 +95,12 @@ def __init__(self,
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
+        orig_dtype = hidden_states.dtype
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states, router_logits)
-        return final_hidden_states.view(orig_shape)
+        final_hidden_states = self.experts(hidden_states.half(), router_logits)
+        return final_hidden_states.view(orig_shape).to(orig_dtype)
 
 
 class MixtralAttention(nn.Module):

From effd2cd5cd96dd5737d605941e7bdb6066ee2816 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 13:10:02 -0400
Subject: [PATCH 14/46] Cleanup, comments

---
 csrc/moe/marlin_moe_ops.cu                    |  4 +-
 tests/kernels/test_moe.py                     |  1 -
 .../layers/fused_moe/__init__.py              |  8 +--
 .../layers/fused_moe/fused_marlin_moe.py      | 50 ++++++++-----------
 .../compressed_tensors_moe.py                 |  1 -
 5 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index e3c18ce5a50b..f6d475a56851 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -1228,8 +1228,6 @@ __device__ inline void MarlinMoESingle(
     if (slice_iters == 0) {
       cp_async_wait<0>();
       bool last = slice_idx == slice_count - 1;
-      // For per-column scales, we only fetch them here in the final step before
-      // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
         if constexpr (w_type.size_bits() == 8) {
           if (s_sh_wr_pred) {
@@ -1237,6 +1235,8 @@ __device__ inline void MarlinMoESingle(
           }
           cp_async_fence();
         } else {
+          // For 4-bit per-column scales, we only fetch them here in the
+          // final step before write-out
           if (last) {
             if (s_sh_wr_pred) {
               cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 606997843982..7e359ff08088 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -241,7 +241,6 @@ def test_fused_marlin_moe(
         sort_indices2,
         topk_weights,
         topk_ids,
-        renormalize=False,
         w1_scale=scales1,
         w2_scale=scales2,
         num_bits=num_bits,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index e9b5703ca28b..dea4a32aec4f 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,3 +1,5 @@
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -6,18 +8,16 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
+    "fused_marlin_moe",
+    "single_marlin_moe",
 ]
 
 if HAS_TRITON:
-    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
 
     __all__ += [
-        "fused_marlin_moe",
-        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 5866c83cd9c8..c7906205760f 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,6 +1,6 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Dict, Optional
 
 import torch
 
@@ -16,11 +16,10 @@ def single_marlin_moe(
     scales: torch.Tensor,
     gating_output: torch.Tensor,
     g_idx: torch.Tensor,
-    rand_perm: torch.Tensor,
+    perm: torch.Tensor,
     topk: int,
     renormalize: bool,
     override_config: Optional[Dict[str, Any]] = None,
-    use_fp8: bool = False,
     num_bits: int = 8,
 ) -> torch.Tensor:
     """
@@ -28,18 +27,18 @@ def single_marlin_moe(
     and top-k gating mechanism. It is meant for testing and debugging.
 
     Parameters:
-    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - w (torch.Tensor): The first set of expert weights.
+    - hidden_states (torch.Tensor): The input tensor to the Marlin Mul.
+    - w (torch.Tensor): The set of expert weights.
+    - scales (torch.Tensor): The quantization scales.
     - gating_output (torch.Tensor): The output of the gating operation
         (before softmax).
+    - g_idx (torch.Tensor): The act_order indices.
+    - perm (torch.Tensor): The act_order input permutation.
     - topk (int): The number of top-k experts to select.
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        product for w. Defaults to False.
+    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -55,8 +54,6 @@ def single_marlin_moe(
         torch.float32, torch.float16, torch.bfloat16
     ]
     assert num_bits in [4, 8]
-    # TODO support this
-    assert not use_fp8
 
     M, K = hidden_states.shape
     E = w.shape[0]
@@ -70,7 +67,7 @@ def single_marlin_moe(
                                         w.shape,
                                         w.shape,
                                         topk_ids.shape[1],
-                                        "float8" if use_fp8 else None,
+                                        None,
                                         override_config=override_config,
                                         is_marlin=True)
     config = get_config_func(M)
@@ -90,7 +87,7 @@ def single_marlin_moe(
 
     intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
         hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
-        g_idx, rand_perm, workspace, scalar_type, M, N, K, True, E, topk,
+        g_idx, perm, workspace, scalar_type, M, N, K, True, E, topk,
         block_size_m, True, False)
 
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
@@ -103,14 +100,11 @@ def fused_marlin_moe(
     gating_output: torch.Tensor,
     g_idx1: torch.Tensor,
     g_idx2: torch.Tensor,
-    rand_perm1: torch.Tensor,
-    rand_perm2: torch.Tensor,
+    perm1: torch.Tensor,
+    perm2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
-    custom_routing_function: Optional[Callable] = None,
-    renormalize: bool = True,
     override_config: Optional[Dict[str, Any]] = None,
-    use_fp8: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     num_bits: int = 8,
@@ -125,18 +119,20 @@ def fused_marlin_moe(
     - w2 (torch.Tensor): The second set of expert weights.
     - gating_output (torch.Tensor): The output of the gating operation
         (before softmax).
-    - topk (int): The number of top-k experts to select.
+    - g_idx1 (torch.Tensor): The first set of act_order indices.
+    - g_idx2 (torch.Tensor): The second set of act_order indices.
+    - perm1 (torch.Tensor): The first act_order input permutation.
+    - perm2 (torch.Tensor): The second act_order input permutation.
+    - topk_weights (torch.Tensor): Top-k weights.
+    - topk_ids (torch.Tensor): Indices of top-k elements.
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        products for w1 and w2. Defaults to False.
     - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
         w1.
     - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
         w2.
+    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -156,8 +152,6 @@ def fused_marlin_moe(
         torch.float32, torch.float16, torch.bfloat16
     ]
     assert num_bits in [4, 8]
-    # TODO support this
-    assert not use_fp8
 
     M, K = hidden_states.shape
     E = w1.shape[0]
@@ -169,7 +163,7 @@ def fused_marlin_moe(
         w1.shape,
         w2.shape,
         topk_ids.shape[1],
-        "float8" if use_fp8 else None,
+        None,
         override_config=override_config,
         is_marlin=True,
     )
@@ -202,7 +196,7 @@ def fused_marlin_moe(
         topk_ids,
         w1_scale,
         g_idx1,
-        rand_perm1,
+        perm1,
         workspace,
         scalar_type,
         M,
@@ -226,7 +220,7 @@ def fused_marlin_moe(
         topk_ids,
         w2_scale,
         g_idx2,
-        rand_perm2,
+        perm2,
         workspace,
         scalar_type,
         M,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index b14ef433d539..7dee2fca8115 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -293,7 +293,6 @@ def apply(
             layer.w2_g_idx_sort_indices,
             topk_weights,
             topk_ids,
-            renormalize=renormalize,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             num_bits=self.num_bits,

From ec47561fa40ddb9146a3c8b694c5f88016052652 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 13:13:32 -0400
Subject: [PATCH 15/46] cleanup

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index d52ff3131fde..11012a326b04 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -598,7 +598,6 @@ def apply(
             layer.w2_g_idx_sort_indices,
             topk_weights,
             topk_ids,
-            renormalize=renormalize,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
             num_bits=self.quant_config.quant_type.size_bits,

From 9f97b3b08a9f29df4518c3141cb43f807ca89911 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Thu, 5 Sep 2024 21:07:45 +0000
Subject: [PATCH 16/46] update/fix weight loading to support tp

---
 vllm/model_executor/layers/fused_moe/layer.py | 80 ++++++++++---------
 .../layers/quantization/gptq_marlin.py        | 11 ++-
 2 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index b0d7d4b538df..f4621e5c4ccc 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -306,10 +306,28 @@ def _load_single_value(self, param: torch.nn.Parameter,
         # Input scales can be loaded directly and should be equal.
         param_data[expert_id] = loaded_weight
 
+    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
+                    shard_dim: int, loaded_weight: torch.tensor, tp_rank: int):
+
+        if shard_id == "w2":
+            self._load_w2(shard_id=shard_id,
+                          shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank)
+        else:
+            assert shard_id in ("w1", "w3")
+            expert_data.copy_(loaded_weight)
+
     def weight_loader(self, param: torch.nn.Parameter,
                       loaded_weight: torch.Tensor, weight_name: str,
                       shard_id: str, expert_id: int) -> None:
 
+        # llm-compressor returns weights on disk which are flipped
+        loaded_weight = loaded_weight.t().contiguous() if (
+            self.quant_method.__class__.__name__
+            == "CompressedTensorsMoEMethod") else loaded_weight
+
         if shard_id not in ("w1", "w2", "w3"):
             raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
                              f"got {shard_id}.")
@@ -325,38 +343,41 @@ def weight_loader(self, param: torch.nn.Parameter,
         expert_data = param.data[expert_id]
         tp_rank = get_tensor_model_parallel_rank()
 
-        # is_transposed: whether or not the parameter is transposed on disk
-        # If transposed, the loaded weight will be transposed and the dim
-        # to shard the loaded weight will be flipped.
+        # is_transposed: if the dim to shard the weight
+        # should be flipped. Required by GPTQ, compressed-tensors
+        # should be whatever dimension intermediate_size is
         is_transposed = getattr(param, "is_transposed", False)
         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
         if is_transposed:
-            loaded_weight = loaded_weight.t().contiguous()
             shard_dim = ~shard_dim
 
-        # GPTQ Values
-        if ("scales" in weight_name or "qweight" in weight_name
-                or "qzeros" in weight_name):
-            if (shard_id == "w1" or shard_id == "w3"):
-                shard_dim = 1 - shard_dim
-            self._load_model_weight_or_group_weight_scale(
-                shard_id=shard_id,
-                shard_dim=shard_dim,
-                loaded_weight=loaded_weight,
-                expert_data=expert_data,
-                tp_rank=tp_rank)
-            return
+        # Case input scale: input_scale loading is only supported for fp8
+        if "input_scale" in weight_name:
+            if param.data[expert_id] != 1 and (param.data[expert_id] -
+                                               loaded_weight).abs() > 1e-5:
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param.data[expert_id]} "
+                    f"vs. {loaded_weight}")
 
-        if "g_idx" in weight_name:
             self._load_single_value(param=param,
                                     loaded_weight=loaded_weight,
                                     expert_id=expert_id)
             return
 
-        # Case weight_scales
-        if "weight_scale" in weight_name:
-            # load the weight scaling based on the quantization scheme
-            # supported weight scales can be found in
+        # Case g_idx
+        if "g_idx" in weight_name:
+            self._load_g_idx(shard_dim=0,
+                             shard_id=shard_id,
+                             loaded_weight=loaded_weight,
+                             expert_data=expert_data,
+                             tp_rank=tp_rank)
+            return
+
+        # Case weight scales and zero_points
+        if ("scale" in weight_name or "zero" in weight_name):
+            # load the weight scales and zp based on the quantization scheme
+            # supported weight scales/zp can be found in
             # FusedMoeWeightScaleSupported
             # TODO @dsikka: once hardened, refactor to use vLLM Parameters
             # specific to each case
@@ -385,22 +406,9 @@ def weight_loader(self, param: torch.nn.Parameter,
                     f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
             return
 
+        # Case weight_shape
         if "weight_shape" in weight_name:
-            self._load_single_value(param=param,
-                                    loaded_weight=loaded_weight,
-                                    expert_id=expert_id)
-            return
-
-        # Case input scale
-        if "input_scale" in weight_name:
-            # Note: input_scale loading is only supported for fp8
-            if param.data[expert_id] != 1 and (param.data[expert_id] -
-                                               loaded_weight).abs() > 1e-5:
-                raise ValueError(
-                    "input_scales of w1 and w3 of a layer "
-                    f"must be equal. But got {param.data[expert_id]} "
-                    f"vs. {loaded_weight}")
-
+            # only required by compressed-tensors
             self._load_single_value(param=param,
                                     loaded_weight=loaded_weight,
                                     expert_id=expert_id)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 11012a326b04..c3b9adb1d198 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -7,8 +7,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     fused_marlin_moe)
-from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
-                                                        FusedMoEMethodBase)
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -372,9 +372,16 @@ def create_weights(
         if self.quant_config.group_size != -1:
             scales_size13 = hidden_size // self.quant_config.group_size
             scales_size2 = intermediate_size // self.quant_config.group_size
+            strategy = FusedMoeWeightScaleSupported.GROUP.value
         else:
             scales_size13 = 1
             scales_size2 = 1
+            strategy = FusedMoeWeightScaleSupported.CHANNEL.value
+
+        extra_weight_attrs.update({
+            "quant_method": strategy,
+            "is_transposed": True
+        })
         # Fused gate_up_proj (column parallel)
         w13_qweight = torch.nn.Parameter(
             torch.empty(

From b841ac498f3441e0532a456c1872b24051072696 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 6 Sep 2024 03:12:10 -0400
Subject: [PATCH 17/46] remove 8-bit stuff for now

---
 csrc/moe/marlin_moe_ops.cu                    | 303 ++++++------------
 csrc/moe/marlin_moe_ops.h                     |   7 +-
 csrc/moe/torch_bindings.cpp                   |   8 +-
 tests/kernels/test_moe.py                     |  14 +-
 vllm/_custom_ops.py                           |   2 +-
 .../layers/fused_moe/__init__.py              |   8 +-
 .../layers/fused_moe/fused_marlin_moe.py      |  52 +--
 .../compressed_tensors_moe.py                 |   1 -
 .../schemes/compressed_tensors_wNa16.py       |   1 -
 .../layers/quantization/gptq_marlin.py        |   1 -
 10 files changed, 120 insertions(+), 277 deletions(-)

diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index f6d475a56851..92184f43c9eb 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -25,8 +25,6 @@
 
 #include <iostream>
 
-#include "core/scalar_type.hpp"
-
 template <typename T>
 inline std::string str(T x) {
   return std::to_string(x);
@@ -133,26 +131,11 @@ __device__ inline int lop3(int a, int b, int c) {
   return res;
 }
 
-// Constructs destination register by taking bytes from 2 sources (based on
-// mask)
-template <int start_byte, int mask>
-__device__ inline uint32_t prmt(uint32_t a) {
-  uint32_t res;
-  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
-               : "=r"(res)
-               : "r"(a), "n"(start_byte), "n"(mask));
-  return res;
-}
-
-template <vllm::ScalarTypeId w_type_id>
-__device__ inline FragB dequant(int q);
-
-// Efficiently dequantize 4bit values packed in an int32 value into a full
-// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
-// with some small changes:
-// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
-template <>
-__device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant(int q) {
   const int LO = 0x000f000f;
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
@@ -173,28 +156,6 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
   return frag_b;
 }
 
-// Fast Int8ToFp16: Efficiently dequantize 8bit int values to fp16
-// Reference:
-// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
-template <>
-__device__ inline FragB dequant<vllm::kU8B128.id()>(int q) {
-  static constexpr uint32_t mask_for_elt_01 = 0x5250;
-  static constexpr uint32_t mask_for_elt_23 = 0x5351;
-  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
-
-  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
-  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
-
-  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
-
-  FragB frag_b;
-  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
-                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
-  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
-                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
-  return frag_b;
-}
-
 // Multiply dequantized values by the corresponding quantization scale; used
 // only for grouped quantization.
 __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
@@ -335,8 +296,7 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   __syncthreads();
 }
 
-template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const int threads,          // number of threads in a threadblock
+template <const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -371,9 +331,6 @@ __device__ inline void MarlinMoESingle(
     bool apply_weights,    // apply weights to output
     int current_m_block    // current m block to start kernel computation from
 ) {
-  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
-  constexpr int pack_factor = 32 / w_type.size_bits();
-
   // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
   // better partitioning with less reductions
   int parallel = 1;
@@ -466,15 +423,12 @@ __device__ inline void MarlinMoESingle(
   constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);
 
   // B sizes/strides
-  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
-  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
-  constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
-  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
-
+  int b_gl_stride = 16 * prob_n / 32;
+  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
   int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
-  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
-  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
-  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
+  constexpr int b_sh_wr_delta = threads;
+  constexpr int b_sh_rd_delta = threads;
   constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
   constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
 
@@ -511,12 +465,12 @@ __device__ inline void MarlinMoESingle(
       a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
   a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
 
-  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
-                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  int b_gl_rd =
+      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
   b_gl_rd += b_sh_stride * slice_col;
   b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x * b_thread_vecs;
-  int b_sh_rd = threadIdx.x * b_thread_vecs;
+  int b_sh_wr = threadIdx.x;
+  int b_sh_rd = threadIdx.x;
 
   // For act_order
   constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@@ -617,7 +571,7 @@ __device__ inline void MarlinMoESingle(
 
   // Register storage for double buffer of shared memory reads.
   FragA frag_a[2][thread_m_blocks];
-  I4 frag_b_quant[2][b_thread_vecs];
+  I4 frag_b_quant[2];
   FragC frag_c[thread_m_blocks][4][2];
   FragS frag_s[2][4];         // No act-order
   FragS act_frag_s[2][4][4];  // For act-order
@@ -683,10 +637,7 @@ __device__ inline void MarlinMoESingle(
       int4* sh_b_stage = sh_b + b_sh_stage * pipe;
   #pragma unroll
       for (int i = 0; i < b_sh_wr_iters; i++) {
-  #pragma unroll
-        for (int j = 0; j < b_thread_vecs; j++) {
-          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
-        }
+        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
         B_ptr[i] += b_gl_rd_delta_o;
       }
 
@@ -764,12 +715,8 @@ __device__ inline void MarlinMoESingle(
     for (int i = 0; i < thread_m_blocks; i++)
       ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
     int4* sh_b_stage = sh_b + b_sh_stage * pipe;
-
-  #pragma unroll
-    for (int i = 0; i < b_thread_vecs; i++) {
-      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
-          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
-    }
+    frag_b_quant[k % 2] = *reinterpret_cast<I4*>(
+        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
   };
 
   bool is_same_group[stages];
@@ -893,19 +840,10 @@ __device__ inline void MarlinMoESingle(
   // dequantization and matmul operations.
   #pragma unroll
     for (int j = 0; j < 4; j++) {
-      int b_quant_0, b_quant_1;
-      if constexpr (w_type.size_bits() == 4) {
-        b_quant_0 = frag_b_quant[k % 2][0][j];
-        b_quant_1 = b_quant_0 >> 8;
-      } else {
-        static_assert(w_type.size_bits() == 8);
-        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
-        b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
-        b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
-      }
+      int b_quant = frag_b_quant[k % 2][j];
+      int b_quant_shift = b_quant >> 8;
 
-      FragB frag_b0 = dequant<w_type_id>(b_quant_0);
-      FragB frag_b1 = dequant<w_type_id>(b_quant_1);
+      FragB frag_b0 = dequant(b_quant);
 
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
@@ -917,6 +855,8 @@ __device__ inline void MarlinMoESingle(
         }
       }
 
+      FragB frag_b1 = dequant(b_quant_shift);
+
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
@@ -941,13 +881,13 @@ __device__ inline void MarlinMoESingle(
   // multiple warps that accumulate their partial sums of the same output
   // location; which we have to reduce over in the end. We do in shared memory.
   auto thread_block_reduce = [&]() {
-    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    constexpr int red_off = threads / b_sh_stride / 2;
     if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride_threads;
-      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
-      constexpr int red_sh_delta = b_sh_stride_threads;
-      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
-                      (threadIdx.x % b_sh_stride_threads);
+      int red_idx = threadIdx.x / b_sh_stride;
+      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
+                      (threadIdx.x % b_sh_stride);
 
       // Parallel logarithmic shared memory reduction. We make sure to avoid any
       // unnecessary read or write iterations, e.g., for two warps we write only
@@ -1095,10 +1035,8 @@ __device__ inline void MarlinMoESingle(
     auto write = [&](int idx, float c0, float c1, FragS& s) {
       half2 res = __halves2half2(__float2half(c0), __float2half(c1));
 
-      // For per-column quantization we finally apply the scale here (only for
-      // 4-bit)
-      if constexpr (!has_act_order && group_blocks == -1 &&
-                    w_type.size_bits() == 4) {
+      // For per-column quantization we finally apply the scale here
+      if constexpr (!has_act_order && group_blocks == -1) {
         res = __hmul2(res, s[0]);
       }
 
@@ -1228,70 +1166,28 @@ __device__ inline void MarlinMoESingle(
     if (slice_iters == 0) {
       cp_async_wait<0>();
       bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
-        if constexpr (w_type.size_bits() == 8) {
+        if (last) {
           if (s_sh_wr_pred) {
             cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
           }
           cp_async_fence();
-        } else {
-          // For 4-bit per-column scales, we only fetch them here in the
-          // final step before write-out
-          if (last) {
-            if (s_sh_wr_pred) {
-              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
-            }
-            cp_async_fence();
-          }
         }
       }
 
       thread_block_reduce();
       if constexpr (!has_act_order && group_blocks == -1) {
-        if constexpr (w_type.size_bits() == 8) {
+        if (last) {
           cp_async_wait<0>();
           __syncthreads();
           if (threadIdx.x / 32 < thread_n_blocks / 4) {
             reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
             reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
           }
-
-        } else {
-          if (last) {
-            cp_async_wait<0>();
-            __syncthreads();
-            if (threadIdx.x / 32 < thread_n_blocks / 4) {
-              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
-              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
-            }
-          }
         }
       }
-
-      // For 8-bit channelwise, we apply the scale before the global reduction
-      // that converts the fp32 results to fp16 (so that we avoid possible
-      // overflow in fp16)
-      if constexpr (!has_act_order && group_blocks == -1 &&
-                    w_type.size_bits() == 8) {
-        if (threadIdx.x / 32 < thread_n_blocks / 4) {
-  #pragma unroll
-          for (int i = 0; i < thread_m_blocks; i++) {
-  #pragma unroll
-            for (int j = 0; j < 4; j++) {
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][0]),
-                          frag_s[j / 2][2 * (j % 2) + 0]);
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][2]),
-                          frag_s[j / 2][2 * (j % 2) + 0]);
-
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][0]),
-                          frag_s[j / 2][2 * (j % 2) + 1]);
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][2]),
-                          frag_s[j / 2][2 * (j % 2) + 1]);
-            }
-          }
-        }
-      }
-
       if (slice_count > 1) {  // only globally reduce if there is more than one
                               // block in a slice
         barrier_acquire(&locks[slice_col], slice_idx);
@@ -1331,8 +1227,7 @@ __device__ inline void MarlinMoESingle(
   }
 }
 
-template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const int threads,          // number of threads in a threadblock
+template <const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1398,29 +1293,29 @@ __global__ void MarlinMoE(
   }
 
   if (max_block == 1) {
-    MarlinMoESingle<w_type_id, threads, 1, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 1, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 2) {
-    MarlinMoESingle<w_type_id, threads, 2, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 2, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 3) {
-    MarlinMoESingle<w_type_id, threads, 3, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 3, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else {
-    MarlinMoESingle<w_type_id, threads, 4, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 4, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
@@ -1447,8 +1342,7 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   return;
 }
 
-template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const int threads,          // number of threads in a threadblock
+template <const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1503,20 +1397,19 @@ const int STAGES = 4;  // 4 pipeline stages fit into shared memory
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;
 
-#define __CALL_IF_MOE(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS,               \
-                      THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS,           \
-                      NUM_THREADS)                                            \
-  else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&          \
+#define __CALL_IF_MOE(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,      \
+                      HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)               \
+  else if (thread_m_blocks == THREAD_M_BLOCKS &&                              \
            thread_n_blocks == THREAD_N_BLOCKS &&                              \
            thread_k_blocks == THREAD_K_BLOCKS &&                              \
            has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&  \
            num_threads == NUM_THREADS) {                                      \
     cudaFuncSetAttribute(                                                     \
-        MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
+        MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,              \
                   THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,      \
         cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);         \
-    MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,     \
-              THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>           \
+    MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
+              STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>                            \
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                    \
             A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,     \
             g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,            \
@@ -1601,43 +1494,42 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
   return thread_config_t{-1, -1, -1};
 }
 
-#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+#define CALL_IF_MOE(N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+                                                               \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                               \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                               \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                               \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
 
 void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
                          const void* sorted_ids, const void* topk_weights,
                          const void* topk_ids, const void* s, const void* g_idx,
                          const void* perm, void* a_tmp, void* expert_offsets,
                          int prob_m, int prob_n, int prob_k, void* workspace,
-                         vllm::ScalarType const& q_type, bool has_act_order,
-                         bool is_k_full, int num_groups, int group_size,
-                         int num_experts, int topk, int moe_block_size, int dev,
-                         cudaStream_t stream, int thread_k, int thread_n,
-                         int sms, int max_par, bool replicate_input,
-                         bool apply_weights) {
+                         bool has_act_order, bool is_k_full, int num_groups,
+                         int group_size, int num_experts, int topk,
+                         int moe_block_size, int dev, cudaStream_t stream,
+                         int thread_k, int thread_n, int sms, int max_par,
+                         bool replicate_input, bool apply_weights) {
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -1719,13 +1611,10 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
     has_act_order = false;
   }
 
-  int pack_factor = 32 / q_type.size_bits();
-
   for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
     const int4* A_ptr = (const int4*)A;
     int4* a_tmp_ptr = (int4*)a_tmp;
-    const int4* B_ptr =
-        (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx;
+    const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx;
     int4* C_ptr = (int4*)C;
     const float* topk_weights_ptr = (const float*)topk_weights;
     const int* sorted_ids_ptr = (const int*)sorted_ids;
@@ -1756,14 +1645,10 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
 
       if (false) {
       }
-      CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
-      CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
-      CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
-      CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
-      CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
-      CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
-      CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
-      CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
+      CALL_IF_MOE(16, 4, 256)
+      CALL_IF_MOE(8, 8, 256)
+      CALL_IF_MOE(8, 4, 128)
+      CALL_IF_MOE(4, 8, 128)
       else {
         TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
                                str(prob_n) + ", " + str(prob_k) + "]" +
@@ -1785,15 +1670,9 @@ torch::Tensor marlin_gemm_moe(
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
-    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
-    int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
+    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
     bool replicate_input, bool apply_weights) {
-  TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
-              "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
-
-  int pack_factor = 32 / b_q_type->size_bits();
-
   int max_par = 4;
 
   int dev = a.get_device();
@@ -1854,8 +1733,8 @@ torch::Tensor marlin_gemm_moe(
       topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
       g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
       expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
-      *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts,
-      topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
+      has_act_order, is_k_full, num_groups, group_size, num_experts, topk,
+      moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
       thread_n, sms, max_par, replicate_input, apply_weights);
   return c;
 }
diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h
index adee8399a4d6..43d264e0770d 100644
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@@ -2,14 +2,11 @@
 
 #include <torch/all.h>
 
-#include "core/scalar_type.hpp"
-
 torch::Tensor marlin_gemm_moe(
     const torch::Tensor& a, const torch::Tensor& b_q_weights,
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
-    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
-    int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
+    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
     bool replicate_input, bool apply_weights);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index cd65a8ee92b9..8a0e625b43fa 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -13,11 +13,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
-      "g_idx, Tensor! perm, Tensor! workspace, "
-      "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
-      "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
-      "int moe_block_size, bool replicate_input, bool apply_weights)"
-      " -> Tensor");
+      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
+      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
+      "bool replicate_input, bool apply_weights) -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 #endif
 }
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 7e359ff08088..2250cf1598b8 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -140,7 +140,6 @@ def compute_max_diff(output, output_ref):
 @pytest.mark.parametrize("topk", [2, 6])
 @pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
 @pytest.mark.parametrize("act_order", [True, False])
-@pytest.mark.parametrize("num_bits", [4, 8])
 def test_fused_marlin_moe(
     m: int,
     n: int,
@@ -149,7 +148,6 @@ def test_fused_marlin_moe(
     topk: int,
     group_size: int,
     act_order: bool,
-    num_bits: int,
 ):
     torch.manual_seed(7)
 
@@ -163,8 +161,7 @@ def test_fused_marlin_moe(
         if group_size in (k, n):
             return
 
-    quant_type = (scalar_types.uint4b8
-                  if num_bits == 4 else scalar_types.uint8b128)
+    quant_type = scalar_types.uint4b8
     dtype = torch.float16
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -243,7 +240,6 @@ def test_fused_marlin_moe(
         topk_ids,
         w1_scale=scales1,
         w2_scale=scales2,
-        num_bits=num_bits,
     )
 
     assert compute_max_diff(marlin_output, triton_output) < 4e-2
@@ -258,7 +254,6 @@ def test_fused_marlin_moe(
 @pytest.mark.parametrize("topk", [2, 6])
 @pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
 @pytest.mark.parametrize("act_order", [True, False])
-@pytest.mark.parametrize("num_bits", [4, 8])
 def test_marlin_moe_mmm(
     m: int,
     n: int,
@@ -267,7 +262,6 @@ def test_marlin_moe_mmm(
     topk: int,
     group_size: int,
     act_order: bool,
-    num_bits: int,
 ):
     if topk > e:
         return
@@ -279,8 +273,7 @@ def test_marlin_moe_mmm(
         if group_size == k:
             return
 
-    quant_type = (scalar_types.uint4b8
-                  if num_bits == 4 else scalar_types.uint8b128)
+    quant_type = scalar_types.uint4b8
     dtype = torch.float16
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
@@ -315,8 +308,7 @@ def test_marlin_moe_mmm(
                                       g_idx,
                                       sort_indices,
                                       topk,
-                                      renormalize=False,
-                                      num_bits=num_bits)
+                                      renormalize=False)
     torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
 
     assert compute_max_diff(marlin_output, torch_output) < 1e-2
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 51db8b34e291..fe254732e730 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -314,7 +314,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                            num_bits: int) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     assert size_k % 16 == 0
-    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
+    output = torch.empty((num_experts, size_k // 16, size_n * 2),
                          device=b_q_weight.device,
                          dtype=b_q_weight.dtype)
     for e in range(num_experts):
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index dea4a32aec4f..e9b5703ca28b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,5 +1,3 @@
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -8,16 +6,18 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "fused_marlin_moe",
-    "single_marlin_moe",
 ]
 
 if HAS_TRITON:
+    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
 
     __all__ += [
+        "fused_marlin_moe",
+        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index c7906205760f..6b01ec0a623a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -7,21 +7,18 @@
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size, try_get_optimal_moe_config)
-from vllm.scalar_type import scalar_types
 
 
 def single_marlin_moe(
-    hidden_states: torch.Tensor,
-    w: torch.Tensor,
-    scales: torch.Tensor,
-    gating_output: torch.Tensor,
-    g_idx: torch.Tensor,
-    perm: torch.Tensor,
-    topk: int,
-    renormalize: bool,
-    override_config: Optional[Dict[str, Any]] = None,
-    num_bits: int = 8,
-) -> torch.Tensor:
+        hidden_states: torch.Tensor,
+        w: torch.Tensor,
+        scales: torch.Tensor,
+        gating_output: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+        override_config: Optional[Dict[str, Any]] = None) -> torch.Tensor:
     """
     This function computes a Marlin MoE MMM using weights w
     and top-k gating mechanism. It is meant for testing and debugging.
@@ -38,7 +35,6 @@ def single_marlin_moe(
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
-    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -50,14 +46,11 @@ def single_marlin_moe(
     assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w.is_contiguous(), "Expert weights must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-    assert num_bits in [4, 8]
+    assert hidden_states.dtype == torch.float16
 
     M, K = hidden_states.shape
     E = w.shape[0]
-    N = w.shape[2] // (num_bits // 2)
+    N = w.shape[2] // 2
 
     topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
                                         renormalize)
@@ -82,13 +75,10 @@ def single_marlin_moe(
                             device="cuda",
                             requires_grad=False)
 
-    scalar_type = (scalar_types.uint4b8
-                   if num_bits == 4 else scalar_types.uint8b128)
-
     intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
         hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
-        g_idx, perm, workspace, scalar_type, M, N, K, True, E, topk,
-        block_size_m, True, False)
+        g_idx, perm, workspace, M, N, K, True, E, topk, block_size_m, True,
+        False)
 
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
 
@@ -107,7 +97,6 @@ def fused_marlin_moe(
     override_config: Optional[Dict[str, Any]] = None,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
-    num_bits: int = 8,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -132,7 +121,6 @@ def fused_marlin_moe(
         w1.
     - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
         w2.
-    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -142,16 +130,13 @@ def fused_marlin_moe(
         0], "Number of tokens mismatch"
     assert hidden_states.shape[
         1] == w1.shape[1] * 16, "Hidden size mismatch w1"
-    assert hidden_states.shape[1] == w2.shape[2] // (
-        num_bits // 2), "Hidden size mismatch w2"
+    assert hidden_states.shape[
+        1] == w2.shape[2] // 2, "Hidden size mismatch w2"
     assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
     assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-    assert num_bits in [4, 8]
+    assert hidden_states.dtype == torch.float16
 
     M, K = hidden_states.shape
     E = w1.shape[0]
@@ -179,9 +164,6 @@ def fused_marlin_moe(
                             device="cuda",
                             requires_grad=False)
 
-    scalar_type = (scalar_types.uint4b8
-                   if num_bits == 4 else scalar_types.uint8b128)
-
     intermediate_cache2 = torch.empty(
         (M * topk_ids.shape[1], N),
         device=hidden_states.device,
@@ -198,7 +180,6 @@ def fused_marlin_moe(
         g_idx1,
         perm1,
         workspace,
-        scalar_type,
         M,
         2 * N,
         K,
@@ -222,7 +203,6 @@ def fused_marlin_moe(
         g_idx2,
         perm2,
         workspace,
-        scalar_type,
         M,
         K,
         N,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 7dee2fca8115..f8a41dfd08d7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -295,5 +295,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
-            num_bits=self.num_bits,
         )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 7ca8eecb9283..e3b74e871290 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -18,7 +18,6 @@
 __all__ = ["CompressedTensorsWNA16"]
 WNA16_SUPPORTED_TYPES_MAP = {
     4: scalar_types.uint4b8,
-    8: scalar_types.uint8b128,
 }
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 11012a326b04..d114d5281284 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -600,5 +600,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
-            num_bits=self.quant_config.quant_type.size_bits,
         )

From 9d8a80cc9c07a4361279c3f890bbfcea65c33df7 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 6 Sep 2024 15:13:54 +0000
Subject: [PATCH 18/46] fix; update large model testing cases

---
 .buildkite/test-pipeline.yaml                       | 13 ++++++++++++-
 tests/weight_loading/models-large.txt               |  3 +++
 tests/weight_loading/models.txt                     |  2 --
 .../compressed_tensors/compressed_tensors_moe.py    |  7 ++-----
 .../schemes/compressed_tensors_wNa16.py             |  1 +
 5 files changed, 18 insertions(+), 8 deletions(-)
 create mode 100644 tests/weight_loading/models-large.txt

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 86eddb576c42..900dc72e7446 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -375,7 +375,18 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-    - bash weight_loading/run_model_weight_loading_test.sh
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 
 
 
 ##### multi gpus test #####
diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt
new file mode 100644
index 000000000000..fe7670574676
--- /dev/null
+++ b/tests/weight_loading/models-large.txt
@@ -0,0 +1,3 @@
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
\ No newline at end of file
diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 1dc529037a98..a3e382acf56b 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -19,8 +19,6 @@ compressed-tensors, nm-testing/tinyllama-oneshot-w8a16-per-channel, main
 compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index f8a41dfd08d7..49c29c2775cb 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -6,8 +6,6 @@
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    WNA16_SUPPORTED_BITS)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat)
 from vllm.model_executor.utils import set_weight_attrs
@@ -40,11 +38,10 @@ def __init__(
 
         if not (self.quant_config.quant_format
                 == CompressionFormat.pack_quantized.value
-                and self.num_bits in WNA16_SUPPORTED_BITS):
+                and self.num_bits == 4):
             raise ValueError("For Fused MoE layers, only ",
                              f"{CompressionFormat.pack_quantized.value} ",
-                             "is supported for the following bits: ",
-                             f"{WNA16_SUPPORTED_BITS}")
+                             "is supported for 4 bits")
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size: int,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index e3b74e871290..cae6ffad53df 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -18,6 +18,7 @@
 __all__ = ["CompressedTensorsWNA16"]
 WNA16_SUPPORTED_TYPES_MAP = {
     4: scalar_types.uint4b8,
+    8: scalar_types.uint8b128
 }
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 

From 315e22f7f86ad7f213a266b10938c5876587b61a Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 6 Sep 2024 16:13:55 +0000
Subject: [PATCH 19/46] add hack to support unfused mixtral pathway for int8

---
 vllm/model_executor/model_loader/utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index d247e4cf3f07..0052489d99dc 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -23,11 +23,19 @@ def get_model_architecture(
     architectures = getattr(model_config.hf_config, "architectures", [])
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
-    mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin"]
+    mixtral_supported = ["fp8", "compressed-tensors"]
+    # for gptq_marlin, only run fused MoE for int4
+    if model_config.quantization == "gptq_marlin":
+        hf_quant_config = getattr(model_config.hf_config,
+                                  "quantization_config", None)
+        if hf_quant_config and hf_quant_config.get("bits") == 4:
+            mixtral_supported.append("gptq_marlin")
+
     if (model_config.quantization is not None
             and model_config.quantization not in mixtral_supported
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]
+
     return ModelRegistry.resolve_model_cls(architectures)
 
 

From 565cc4334d7ad9a2bc9d87cb8b0ae6db189eb1a9 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 6 Sep 2024 18:29:36 +0000
Subject: [PATCH 20/46] fix install for tpu test

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 72e4149e3128..a73c462c148c 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -5,8 +5,6 @@
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    fused_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -583,6 +581,8 @@ def apply(
         topk_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            fused_marlin_moe)
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,

From 8886423085ba84db2cea64dd24502620f7904009 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Sat, 7 Sep 2024 06:30:32 -0400
Subject: [PATCH 21/46] Move float16 typecast hack to gptq marlin moe method

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 3 +++
 vllm/model_executor/models/mixtral.py                  | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a73c462c148c..1691139bedab 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -584,6 +584,9 @@ def apply(
         from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
             fused_marlin_moe)
 
+        # The input must currently be float16
+        x = x.half()
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 148ef393277e..df7f39097bdc 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -99,7 +99,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states.half(), router_logits)
+        final_hidden_states = self.experts(hidden_states, router_logits)
         return final_hidden_states.view(orig_shape).to(orig_dtype)
 
 

From ab274976a52486b6bf41c93b36d2e8fd62af91c2 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Sat, 7 Sep 2024 06:58:44 -0400
Subject: [PATCH 22/46] Move output type conversion to gptq method as well

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++-
 vllm/model_executor/models/mixtral.py                  | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 1691139bedab..33899f1fb671 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -585,6 +585,7 @@ def apply(
             fused_marlin_moe)
 
         # The input must currently be float16
+        orig_dtype = x.dtype
         x = x.half()
 
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -610,4 +611,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
-        )
+        ).to(orig_dtype)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index df7f39097bdc..6413b56605ec 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -95,12 +95,11 @@ def __init__(self,
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
-        orig_dtype = hidden_states.dtype
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, router_logits)
-        return final_hidden_states.view(orig_shape).to(orig_dtype)
+        return final_hidden_states.view(orig_shape)
 
 
 class MixtralAttention(nn.Module):

From 847e8602334de1f8202cdae240cb139518b0f478 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 30 Aug 2024 09:07:15 -0400
Subject: [PATCH 23/46] Enable 8-bit weights in Fused Marlin MoE

---
 csrc/moe/marlin_moe_ops.cu                    | 301 ++++++++++++------
 csrc/moe/marlin_moe_ops.h                     |   9 +-
 csrc/moe/torch_bindings.cpp                   |  11 +-
 tests/kernels/test_moe.py                     | 225 ++++++++++++-
 vllm/_custom_ops.py                           |   2 +-
 .../layers/fused_moe/__init__.py              |  16 +-
 .../layers/fused_moe/fused_moe.py             | 138 ++------
 .../layers/fused_moe/fused_moe_marlin.py      | 245 ++++++++++++++
 .../compressed_tensors_moe.py                 |  34 +-
 .../layers/quantization/utils/marlin_utils.py |  17 +
 .../quantization/utils/marlin_utils_test.py   |  11 +-
 .../layers/quantization/utils/quant_utils.py  |  19 +-
 12 files changed, 775 insertions(+), 253 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/fused_moe_marlin.py

diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index 1e170e80d2f7..e3c18ce5a50b 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -25,6 +25,8 @@
 
 #include <iostream>
 
+#include "core/scalar_type.hpp"
+
 template <typename T>
 inline std::string str(T x) {
   return std::to_string(x);
@@ -131,11 +133,26 @@ __device__ inline int lop3(int a, int b, int c) {
   return res;
 }
 
-// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
-// values. We mostly follow the strategy in the link below, with some small
-// changes:
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
-__device__ inline FragB dequant(int q) {
+// Constructs destination register by taking bytes from 2 sources (based on
+// mask)
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+template <vllm::ScalarTypeId w_type_id>
+__device__ inline FragB dequant(int q);
+
+// Efficiently dequantize 4bit values packed in an int32 value into a full
+// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
+// with some small changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+template <>
+__device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
   const int LO = 0x000f000f;
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
@@ -156,6 +173,28 @@ __device__ inline FragB dequant(int q) {
   return frag_b;
 }
 
+// Fast Int8ToFp16: Efficiently dequantize 8bit int values to fp16
+// Reference:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+template <>
+__device__ inline FragB dequant<vllm::kU8B128.id()>(int q) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
 // Multiply dequantized values by the corresponding quantization scale; used
 // only for grouped quantization.
 __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
@@ -296,7 +335,8 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   __syncthreads();
 }
 
-template <const int threads,          // number of threads in a threadblock
+template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -331,6 +371,9 @@ __device__ inline void MarlinMoESingle(
     bool apply_weights,    // apply weights to output
     int current_m_block    // current m block to start kernel computation from
 ) {
+  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
+  constexpr int pack_factor = 32 / w_type.size_bits();
+
   // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
   // better partitioning with less reductions
   int parallel = 1;
@@ -423,12 +466,15 @@ __device__ inline void MarlinMoESingle(
   constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);
 
   // B sizes/strides
-  int b_gl_stride = 16 * prob_n / 32;
-  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
   int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
-  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
-  constexpr int b_sh_wr_delta = threads;
-  constexpr int b_sh_rd_delta = threads;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
   constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
   constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
 
@@ -465,12 +511,12 @@ __device__ inline void MarlinMoESingle(
       a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
   a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
 
-  int b_gl_rd =
-      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
+                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
   b_gl_rd += b_sh_stride * slice_col;
   b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x;
-  int b_sh_rd = threadIdx.x;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
 
   // For act_order
   constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@@ -571,7 +617,7 @@ __device__ inline void MarlinMoESingle(
 
   // Register storage for double buffer of shared memory reads.
   FragA frag_a[2][thread_m_blocks];
-  I4 frag_b_quant[2];
+  I4 frag_b_quant[2][b_thread_vecs];
   FragC frag_c[thread_m_blocks][4][2];
   FragS frag_s[2][4];         // No act-order
   FragS act_frag_s[2][4][4];  // For act-order
@@ -637,7 +683,10 @@ __device__ inline void MarlinMoESingle(
       int4* sh_b_stage = sh_b + b_sh_stage * pipe;
   #pragma unroll
       for (int i = 0; i < b_sh_wr_iters; i++) {
-        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+  #pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
         B_ptr[i] += b_gl_rd_delta_o;
       }
 
@@ -715,8 +764,12 @@ __device__ inline void MarlinMoESingle(
     for (int i = 0; i < thread_m_blocks; i++)
       ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
     int4* sh_b_stage = sh_b + b_sh_stage * pipe;
-    frag_b_quant[k % 2] = *reinterpret_cast<I4*>(
-        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
+
+  #pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
+          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
   };
 
   bool is_same_group[stages];
@@ -840,10 +893,19 @@ __device__ inline void MarlinMoESingle(
   // dequantization and matmul operations.
   #pragma unroll
     for (int j = 0; j < 4; j++) {
-      int b_quant = frag_b_quant[k % 2][j];
-      int b_quant_shift = b_quant >> 8;
+      int b_quant_0, b_quant_1;
+      if constexpr (w_type.size_bits() == 4) {
+        b_quant_0 = frag_b_quant[k % 2][0][j];
+        b_quant_1 = b_quant_0 >> 8;
+      } else {
+        static_assert(w_type.size_bits() == 8);
+        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
+        b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+      }
 
-      FragB frag_b0 = dequant(b_quant);
+      FragB frag_b0 = dequant<w_type_id>(b_quant_0);
+      FragB frag_b1 = dequant<w_type_id>(b_quant_1);
 
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
@@ -855,8 +917,6 @@ __device__ inline void MarlinMoESingle(
         }
       }
 
-      FragB frag_b1 = dequant(b_quant_shift);
-
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
@@ -881,13 +941,13 @@ __device__ inline void MarlinMoESingle(
   // multiple warps that accumulate their partial sums of the same output
   // location; which we have to reduce over in the end. We do in shared memory.
   auto thread_block_reduce = [&]() {
-    constexpr int red_off = threads / b_sh_stride / 2;
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
     if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride;
-      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
-      constexpr int red_sh_delta = b_sh_stride;
-      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
-                      (threadIdx.x % b_sh_stride);
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
+                      (threadIdx.x % b_sh_stride_threads);
 
       // Parallel logarithmic shared memory reduction. We make sure to avoid any
       // unnecessary read or write iterations, e.g., for two warps we write only
@@ -1035,8 +1095,10 @@ __device__ inline void MarlinMoESingle(
     auto write = [&](int idx, float c0, float c1, FragS& s) {
       half2 res = __halves2half2(__float2half(c0), __float2half(c1));
 
-      // For per-column quantization we finally apply the scale here
-      if constexpr (!has_act_order && group_blocks == -1) {
+      // For per-column quantization we finally apply the scale here (only for
+      // 4-bit)
+      if constexpr (!has_act_order && group_blocks == -1 &&
+                    w_type.size_bits() == 4) {
         res = __hmul2(res, s[0]);
       }
 
@@ -1169,25 +1231,67 @@ __device__ inline void MarlinMoESingle(
       // For per-column scales, we only fetch them here in the final step before
       // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
-        if (last) {
+        if constexpr (w_type.size_bits() == 8) {
           if (s_sh_wr_pred) {
             cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
           }
           cp_async_fence();
+        } else {
+          if (last) {
+            if (s_sh_wr_pred) {
+              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+            }
+            cp_async_fence();
+          }
         }
       }
 
       thread_block_reduce();
       if constexpr (!has_act_order && group_blocks == -1) {
-        if (last) {
+        if constexpr (w_type.size_bits() == 8) {
           cp_async_wait<0>();
           __syncthreads();
           if (threadIdx.x / 32 < thread_n_blocks / 4) {
             reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
             reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
           }
+
+        } else {
+          if (last) {
+            cp_async_wait<0>();
+            __syncthreads();
+            if (threadIdx.x / 32 < thread_n_blocks / 4) {
+              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (!has_act_order && group_blocks == -1 &&
+                    w_type.size_bits() == 8) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+            for (int j = 0; j < 4; j++) {
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][0]),
+                          frag_s[j / 2][2 * (j % 2) + 0]);
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][2]),
+                          frag_s[j / 2][2 * (j % 2) + 0]);
+
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][0]),
+                          frag_s[j / 2][2 * (j % 2) + 1]);
+              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][2]),
+                          frag_s[j / 2][2 * (j % 2) + 1]);
+            }
+          }
         }
       }
+
       if (slice_count > 1) {  // only globally reduce if there is more than one
                               // block in a slice
         barrier_acquire(&locks[slice_col], slice_idx);
@@ -1227,7 +1331,8 @@ __device__ inline void MarlinMoESingle(
   }
 }
 
-template <const int threads,          // number of threads in a threadblock
+template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1293,29 +1398,29 @@ __global__ void MarlinMoE(
   }
 
   if (max_block == 1) {
-    MarlinMoESingle<threads, 1, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 1, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 2) {
-    MarlinMoESingle<threads, 2, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 2, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 3) {
-    MarlinMoESingle<threads, 3, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 3, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else {
-    MarlinMoESingle<threads, 4, thread_n_blocks, thread_k_blocks, stages,
-                    has_act_order, group_blocks>(
+    MarlinMoESingle<w_type_id, threads, 4, thread_n_blocks, thread_k_blocks,
+                    stages, has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
@@ -1342,7 +1447,8 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   return;
 }
 
-template <const int threads,          // number of threads in a threadblock
+template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1397,19 +1503,20 @@ const int STAGES = 4;  // 4 pipeline stages fit into shared memory
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;
 
-#define __CALL_IF_MOE(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,      \
-                      HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)               \
-  else if (thread_m_blocks == THREAD_M_BLOCKS &&                              \
+#define __CALL_IF_MOE(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS,               \
+                      THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS,           \
+                      NUM_THREADS)                                            \
+  else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&          \
            thread_n_blocks == THREAD_N_BLOCKS &&                              \
            thread_k_blocks == THREAD_K_BLOCKS &&                              \
            has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&  \
            num_threads == NUM_THREADS) {                                      \
     cudaFuncSetAttribute(                                                     \
-        MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,              \
+        MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
                   THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,      \
         cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);         \
-    MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
-              STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>                            \
+    MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,     \
+              THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>           \
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                    \
             A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,     \
             g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,            \
@@ -1494,42 +1601,43 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
   return thread_config_t{-1, -1, -1};
 }
 
-#define CALL_IF_MOE(N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-                                                               \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                               \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                               \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                               \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+                                                                       \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                                       \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                                       \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                                       \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
 
 void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
                          const void* sorted_ids, const void* topk_weights,
                          const void* topk_ids, const void* s, const void* g_idx,
                          const void* perm, void* a_tmp, void* expert_offsets,
                          int prob_m, int prob_n, int prob_k, void* workspace,
-                         bool has_act_order, bool is_k_full, int num_groups,
-                         int group_size, int num_experts, int topk,
-                         int moe_block_size, int dev, cudaStream_t stream,
-                         int thread_k, int thread_n, int sms, int max_par,
-                         bool replicate_input, bool apply_weights) {
+                         vllm::ScalarType const& q_type, bool has_act_order,
+                         bool is_k_full, int num_groups, int group_size,
+                         int num_experts, int topk, int moe_block_size, int dev,
+                         cudaStream_t stream, int thread_k, int thread_n,
+                         int sms, int max_par, bool replicate_input,
+                         bool apply_weights) {
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -1611,10 +1719,13 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
     has_act_order = false;
   }
 
+  int pack_factor = 32 / q_type.size_bits();
+
   for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
     const int4* A_ptr = (const int4*)A;
     int4* a_tmp_ptr = (int4*)a_tmp;
-    const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx;
+    const int4* B_ptr =
+        (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx;
     int4* C_ptr = (int4*)C;
     const float* topk_weights_ptr = (const float*)topk_weights;
     const int* sorted_ids_ptr = (const int*)sorted_ids;
@@ -1645,10 +1756,14 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
 
       if (false) {
       }
-      CALL_IF_MOE(16, 4, 256)
-      CALL_IF_MOE(8, 8, 256)
-      CALL_IF_MOE(8, 4, 128)
-      CALL_IF_MOE(4, 8, 128)
+      CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
+      CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
+      CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
+      CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
+      CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
+      CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
+      CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
+      CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
       else {
         TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
                                str(prob_n) + ", " + str(prob_k) + "]" +
@@ -1670,9 +1785,15 @@ torch::Tensor marlin_gemm_moe(
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
-    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
+    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
+    int64_t num_experts, int64_t topk, int64_t moe_block_size,
     bool replicate_input, bool apply_weights) {
+  TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
+              "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
+
+  int pack_factor = 32 / b_q_type->size_bits();
+
   int max_par = 4;
 
   int dev = a.get_device();
@@ -1733,8 +1854,8 @@ torch::Tensor marlin_gemm_moe(
       topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
       g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
       expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
-      has_act_order, is_k_full, num_groups, group_size, num_experts, topk,
-      moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
+      *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts,
+      topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
       thread_n, sms, max_par, replicate_input, apply_weights);
   return c;
-}
\ No newline at end of file
+}
diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h
index 01ba8ff69850..adee8399a4d6 100644
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@@ -2,11 +2,14 @@
 
 #include <torch/all.h>
 
+#include "core/scalar_type.hpp"
+
 torch::Tensor marlin_gemm_moe(
     const torch::Tensor& a, const torch::Tensor& b_q_weights,
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
-    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
-    bool replicate_input, bool apply_weights);
\ No newline at end of file
+    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
+    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
+    int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    bool replicate_input, bool apply_weights);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index d4d43e2c601b..d2352375de33 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -9,16 +9,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "token_expert_indices, Tensor gating_output) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
 
-#ifndef USE_ROCM
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
-      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
-      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
-      "bool replicate_input, bool apply_weights) -> Tensor");
-
+      "g_idx, Tensor! perm, Tensor! workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
+      "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
+      "int moe_block_size, bool replicate_input, bool apply_weights)"
+      " -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
-#endif
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index f526c381b333..f7642bf02b05 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -2,6 +2,8 @@
 
 Run `pytest tests/kernels/test_moe.py`.
 """
+from typing import List
+
 import pytest
 import torch
 from transformers import MixtralConfig
@@ -9,7 +11,12 @@
 
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+    fused_moe_marlin, single_moe_marlin)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
+from vllm.scalar_type import scalar_types
 
 
 def torch_moe(a, w1, w2, score, topk):
@@ -29,6 +36,20 @@ def torch_moe(a, w1, w2, score, topk):
             topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
 
 
+def torch_moe_single(a, w, score, topk):
+    # Reference: per token, sum the unweighted outputs of its top-k experts.
+    B, D = a.shape
+    rows = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w.shape[1], dtype=a.dtype, device=a.device)
+    probs = torch.softmax(score, dim=-1, dtype=torch.float32)
+    expert_ids = torch.topk(probs, topk).indices.view(-1)
+    for expert in range(w.shape[0]):
+        picked = expert_ids == expert
+        if picked.sum():
+            out[picked] = rows[picked] @ w[expert].transpose(0, 1)
+    return out.view(B, -1, w.shape[1]).sum(dim=1)
+
+
 @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 511, 1024])
@@ -43,11 +64,11 @@ def test_fused_moe(
     topk: int,
     dtype: torch.dtype,
 ):
-    a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
 
-    score = torch.randn((m, e), device='cuda', dtype=dtype)
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
     triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
     torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
@@ -99,3 +120,199 @@ def test_mixtral_moe(dtype: torch.dtype):
                                vllm_states,
                                rtol=mixtral_moe_tol[dtype],
                                atol=mixtral_moe_tol[dtype])
+
+
+def stack_and_dev(tensors: List[torch.Tensor]):
+    target = tensors[0].device
+    return torch.stack(tensors).to(target)
+
+
+def compute_max_diff(output, output_ref):
+    # Despite the name, this is mean |output - ref| divided by mean |ref|.
+    return (output - output_ref).abs().mean() / output_ref.abs().mean()
+
+
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [4, 8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [True, False])
+@pytest.mark.parametrize("num_bits", [4, 8])
+def test_fused_marlin_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+    act_order: bool,
+    num_bits: int,
+):
+    torch.manual_seed(7)
+
+    if topk > e:
+        return
+
+    # Filter act_order
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size in (k, n):
+            return
+
+    quant_type = (scalar_types.uint4b8
+                  if num_bits == 4 else scalar_types.uint8b128)
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    # NOTE(review): only expert 0 is overwritten; was every expert intended?
+    w2[0] = torch.eye(k, n, device="cuda", dtype=dtype)
+
+    w_ref1_l = []
+    qweight1_l = []
+    scales1_l = []
+    g_idx1_l = []
+    sort_indices1_l = []
+
+    for i in range(w1.shape[0]):
+        test_perm = torch.randperm(k)
+        w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
+            w1[i].transpose(1, 0), quant_type, group_size, act_order,
+            test_perm)
+        w_ref1_l.append(w_ref1)
+        qweight1_l.append(qweight1)
+        scales1_l.append(scales1)
+        g_idx1_l.append(g_idx1)
+        sort_indices1_l.append(sort_indices1)
+
+    w_ref1 = stack_and_dev(w_ref1_l)
+    qweight1 = stack_and_dev(qweight1_l).contiguous()
+    scales1 = stack_and_dev(scales1_l)
+    g_idx1 = stack_and_dev(g_idx1_l)
+    sort_indices1 = stack_and_dev(sort_indices1_l)
+
+    w_ref2_l = []
+    qweight2_l = []
+    scales2_l = []
+    g_idx2_l = []
+    sort_indices2_l = []
+
+    for i in range(w2.shape[0]):
+        test_perm = torch.randperm(n)
+        w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
+            w2[i].transpose(1, 0), quant_type, group_size, act_order,
+            test_perm)
+        w_ref2_l.append(w_ref2)
+        qweight2_l.append(qweight2)
+        scales2_l.append(scales2)
+        g_idx2_l.append(g_idx2)
+        sort_indices2_l.append(sort_indices2)
+
+    w_ref2 = stack_and_dev(w_ref2_l)
+    qweight2 = stack_and_dev(qweight2_l).contiguous()
+    scales2 = stack_and_dev(scales2_l)
+    g_idx2 = stack_and_dev(g_idx2_l)
+    sort_indices2 = stack_and_dev(sort_indices2_l)
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    triton_output = fused_moe(
+        a,
+        w_ref1.transpose(1, 2).contiguous(),
+        w_ref2.transpose(1, 2).contiguous(),
+        score,
+        topk,
+        renormalize=False,
+    )
+    marlin_output = fused_moe_marlin(
+        a,
+        qweight1,
+        qweight2,
+        score,
+        g_idx1,
+        g_idx2,
+        sort_indices1,
+        sort_indices2,
+        topk,
+        renormalize=False,
+        w1_scale=scales1,
+        w2_scale=scales2,
+        num_bits=num_bits,
+    )
+
+    assert compute_max_diff(marlin_output, triton_output) < 4e-2
+
+
+@pytest.mark.skip("This test is here for the sake of debugging, "
+                  "don't run it in automated tests.")
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [4, 8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [True, False])
+@pytest.mark.parametrize("num_bits", [4, 8])
+def test_marlin_moe_mmm(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+    act_order: bool,
+    num_bits: int,
+):
+    torch.manual_seed(7)
+
+    if topk > e:
+        return
+
+    # Filter act_order
+    if act_order:
+        if group_size in (-1, k):
+            return
+
+    quant_type = (scalar_types.uint4b8
+                  if num_bits == 4 else scalar_types.uint8b128)
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
+
+    w_ref_l = []
+    qweights_l = []
+    scales_l = []
+    g_idx_l = []
+    sort_indices_l = []
+
+    for i in range(w.shape[0]):
+        test_perm = torch.randperm(k)
+        w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
+            w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
+        w_ref_l.append(w_ref)
+        qweights_l.append(qweight)
+        scales_l.append(scales)
+        g_idx_l.append(g_idx)
+        sort_indices_l.append(sort_indices)
+
+    w_ref = stack_and_dev(w_ref_l)
+    qweight = stack_and_dev(qweights_l).contiguous()
+    scales = stack_and_dev(scales_l)
+    g_idx = stack_and_dev(g_idx_l)
+    sort_indices = stack_and_dev(sort_indices_l)
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    marlin_output = single_moe_marlin(a,
+                                      qweight,
+                                      scales,
+                                      score,
+                                      g_idx,
+                                      sort_indices,
+                                      topk,
+                                      renormalize=False,
+                                      num_bits=num_bits)
+    torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
+
+    assert compute_max_diff(marlin_output, torch_output) < 1e-2
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 151cdbee8eb0..77c46584ef53 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -308,7 +308,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                            num_bits: int) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     assert size_k % 16 == 0
-    output = torch.empty((num_experts, size_k // 16, size_n * 2),
+    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
                          device=b_q_weight.device,
                          dtype=b_q_weight.dtype)
     for e in range(num_experts):
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index fd6f41b90042..65a9b78a118c 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,17 +1,23 @@
+from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
-__all__ = ["FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported"]
+__all__ = [
+    "FusedMoE",
+    "FusedMoEMethodBase",
+    "FusedMoeWeightScaleSupported",
+    "fused_moe_marlin",
+    "single_moe_marlin",
+]
 
 if HAS_TRITON:
-
     from vllm.model_executor.layers.fused_moe.fused_moe import (
-        fused_experts, fused_marlin_moe, fused_moe, fused_topk,
-        get_config_file_name, grouped_topk)
+        fused_experts, fused_moe, fused_topk, get_config_file_name,
+        grouped_topk)
 
     __all__ += [
-        "fused_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 05169eaddb25..bd13d8fecbb9 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -323,15 +323,22 @@ def get_moe_configs(E: int, N: int,
     return None
 
 
-def get_default_config(M: int, E: int, N: int, K: int, topk: int,
-                       dtype: Optional[str],
-                       is_marlin: bool) -> Dict[str, int]:
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+    is_marlin: bool,
+) -> Dict[str, int]:
     config = {
         'BLOCK_SIZE_M': 64,
         'BLOCK_SIZE_N': 64,
         'BLOCK_SIZE_K': 32,
         'GROUP_SIZE_M': 8
     }
+    # A heuristic: fused marlin works faster with this config for small M
     if M <= E or (is_marlin and M <= 32):
         config = {
             'BLOCK_SIZE_M': 16,
@@ -342,14 +349,15 @@ def get_default_config(M: int, E: int, N: int, K: int, topk: int,
     return config
 
 
-def try_get_optimal_moe_config(w1_shape: Tuple[int, ...],
-                               w2_shape: Tuple[int, ...],
-                               top_k: int,
-                               dtype: Optional[str],
-                               M: int,
-                               override_config: Optional[Dict[str,
-                                                              Any]] = None,
-                               is_marlin: bool = False):
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    override_config: Optional[Dict[str, Any]] = None,
+    is_marlin: bool = False,
+):
     if override_config:
         config = override_config
     else:
@@ -391,6 +399,7 @@ def fused_topk(
                                         topk,
                                         dtype=torch.int32,
                                         device=hidden_states.device)
+
     ops.topk_softmax(
         topk_weights,
         topk_ids,
@@ -437,113 +446,6 @@ def grouped_topk(hidden_states: torch.Tensor,
     return topk_weights, topk_ids
 
 
-def fused_marlin_moe(hidden_states: torch.Tensor,
-                     w1: torch.Tensor,
-                     w2: torch.Tensor,
-                     gating_output: torch.Tensor,
-                     g_idx1: torch.Tensor,
-                     g_idx2: torch.Tensor,
-                     rand_perm1: torch.Tensor,
-                     rand_perm2: torch.Tensor,
-                     topk: int,
-                     custom_routing_function: Optional[Callable] = None,
-                     renormalize: bool = True,
-                     override_config: Optional[Dict[str, Any]] = None,
-                     use_fp8: bool = False,
-                     w1_scale: Optional[torch.Tensor] = None,
-                     w2_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
-    """
-    This function computes a Mixture of Experts (MoE) layer using two sets of
-    weights, w1 and w2, and top-k gating mechanism.
-    Parameters:
-    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - w1 (torch.Tensor): The first set of expert weights.
-    - w2 (torch.Tensor): The second set of expert weights.
-    - gating_output (torch.Tensor): The output of the gating operation
-        (before softmax).
-    - topk (int): The number of top-k experts to select.
-    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
-    - override_config (Optional[Dict[str, Any]]): Optional override
-        for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        products for w1 and w2. Defaults to False.
-    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
-        w1.
-    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
-        w2.
-    Returns:
-    - torch.Tensor: The output tensor after applying the MoE layer.
-    """
-    # Check constraints.
-    assert hidden_states.shape[0] == gating_output.shape[0], (
-        "Number of tokens mismatch")
-    assert hidden_states.shape[
-        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
-    assert hidden_states.shape[
-        1] == w2.shape[2] // 2, "Hidden size mismatch w2"
-    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
-    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
-    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
-    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-
-    #TODO fp8 is not implemented yet
-    assert not use_fp8
-
-    M, K = hidden_states.shape
-    E = w1.shape[0]
-    N = w2.shape[1] * 16
-
-    if custom_routing_function is None:
-        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                            renormalize)
-    else:
-        topk_weights, topk_ids = custom_routing_function(
-            hidden_states, gating_output, topk, renormalize)
-
-    get_config_func = functools.partial(try_get_optimal_moe_config,
-                                        w1.shape,
-                                        w2.shape,
-                                        topk_ids.shape[1],
-                                        "float8" if use_fp8 else None,
-                                        override_config=override_config,
-                                        is_marlin=True)
-    config = get_config_func(M)
-
-    block_size_m = config['BLOCK_SIZE_M']
-
-    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
-
-    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
-    workspace = torch.zeros(max_workspace_size,
-                            dtype=torch.int,
-                            device="cuda",
-                            requires_grad=False)
-
-    intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
-
-    intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe(
-        hidden_states, w1, sorted_token_ids, topk_weights, topk_ids, w1_scale,
-        g_idx1, rand_perm1, workspace, M, 2 * N, K, True, E, topk,
-        block_size_m, True, False)
-
-    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-
-    intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
-        intermediate_cache2, w2, sorted_token_ids, topk_weights, topk_ids,
-        w2_scale, g_idx2, rand_perm2, workspace, M, K, N, True, E, topk,
-        block_size_m, False, True)
-
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                     dim=1)
-
-
 def get_config_dtype_str(dtype: torch.dtype,
                          use_int8_w8a16: Optional[bool] = False,
                          use_fp8_w8a8: Optional[bool] = False):
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
new file mode 100644
index 000000000000..40f9f66f1706
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -0,0 +1,245 @@
+"""Fused MoE utilities for GPTQ."""
+import functools
+from typing import Any, Dict, Optional
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.scalar_type import scalar_types
+
+from .fused_moe import (fused_topk, moe_align_block_size,
+                        try_get_optimal_moe_config)
+
+
+def single_moe_marlin(
+    hidden_states: torch.Tensor,
+    w: torch.Tensor,
+    scales: torch.Tensor,
+    gating_output: torch.Tensor,
+    g_idx: torch.Tensor,
+    rand_perm: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    override_config: Optional[Dict[str, Any]] = None,
+    use_fp8: bool = False,
+    num_bits: int = 8,
+) -> torch.Tensor:
+    """
+    Compute a single Marlin MoE matrix multiplication (MMM) with the packed
+    expert weights w and top-k gating. Meant for testing and debugging.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w (torch.Tensor): The packed, quantized expert weights.
+    - scales (torch.Tensor): The quantization scales for w.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx, rand_perm (torch.Tensor): act_order tensors passed to the kernel.
+    - topk (int): The number of top-k experts to select.
+    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
+    - use_fp8 (bool): Not supported yet; must be False.
+    - num_bits (int): Bit width of the quantized weights, 4 or 8.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[0], (
+        "Number of tokens mismatch")
+    assert hidden_states.shape[1] == w.shape[1] * 16, "Hidden size mismatch"
+    assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w.is_contiguous(), "Expert weights must be contiguous"
+    assert hidden_states.dtype in [
+        torch.float32, torch.float16, torch.bfloat16
+    ]
+    assert num_bits in [4, 8]
+    # TODO: fp8 is not supported yet
+    assert not use_fp8
+
+    M, K = hidden_states.shape
+    E = w.shape[0]
+    N = w.shape[2] // (num_bits // 2)
+
+    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                        renormalize)
+
+    # This might not be an optimal config for a single MMM
+    get_config_func = functools.partial(try_get_optimal_moe_config,
+                                        w.shape,
+                                        w.shape,
+                                        topk_ids.shape[1],
+                                        "float8" if use_fp8 else None,
+                                        override_config=override_config,
+                                        is_marlin=True)
+    config = get_config_func(M)
+
+    block_size_m = config['BLOCK_SIZE_M']
+
+    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
+
+    max_workspace_size = (N // 64) * 16
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device=hidden_states.device,
+                            requires_grad=False)
+
+    scalar_type = (scalar_types.uint4b8
+                   if num_bits == 4 else scalar_types.uint8b128)
+
+    intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
+        hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
+        g_idx, rand_perm, workspace, scalar_type, M, N, K, True, E, topk,
+        block_size_m, True, False)
+
+    return torch.sum(intermediate_cache, dim=1)
+
+
+def fused_moe_marlin(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    g_idx1: torch.Tensor,
+    g_idx2: torch.Tensor,
+    rand_perm1: torch.Tensor,
+    rand_perm2: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    override_config: Optional[Dict[str, Any]] = None,
+    use_fp8: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    num_bits: int = 8,
+) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    quantized weights, w1 and w2, and top-k gating mechanism. Routing always
+    goes through fused_topk.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of packed expert weights.
+    - w2 (torch.Tensor): The second set of packed expert weights.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx1, g_idx2 (torch.Tensor): Group index tensors for w1 and w2.
+    - rand_perm1, rand_perm2 (torch.Tensor): Permutation tensors for w1 and
+        w2, forwarded to the kernel together with the group indices.
+    - topk (int): The number of top-k experts to select.
+    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
+    - override_config (Optional[Dict[str, Any]]): Optional override
+        for the kernel configuration.
+    - use_fp8 (bool): Not supported yet; must be False.
+    - w1_scale (Optional[torch.Tensor]): Scale to be used for w1.
+    - w2_scale (Optional[torch.Tensor]): Scale to be used for w2.
+    - num_bits (int): Bit width of the quantized weights, 4 or 8.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[
+        0], "Number of tokens mismatch"
+    assert hidden_states.shape[
+        1] == w1.shape[1] * 16, "Hidden size mismatch w1"
+    assert hidden_states.shape[1] == w2.shape[2] // (
+        num_bits // 2), "Hidden size mismatch w2"
+    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [
+        torch.float32, torch.float16, torch.bfloat16
+    ]
+    assert num_bits in [4, 8]
+    # TODO: fp8 is not supported yet
+    assert not use_fp8
+
+    M, K = hidden_states.shape
+    E = w1.shape[0]
+    N = w2.shape[1] * 16
+
+    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                        renormalize)
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        w2.shape,
+        topk_ids.shape[1],
+        "float8" if use_fp8 else None,
+        override_config=override_config,
+        is_marlin=True,
+    )
+    config = get_config_func(M)
+
+    block_size_m = config["BLOCK_SIZE_M"]
+
+    sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m, E)
+
+    max_workspace_size = ((M + 255) // 256) * (max(2 * N, K) // 64) * 16
+    workspace = torch.zeros(max_workspace_size,
+                            dtype=torch.int,
+                            device=hidden_states.device,
+                            requires_grad=False)
+
+    scalar_type = (scalar_types.uint4b8
+                   if num_bits == 4 else scalar_types.uint8b128)
+
+    intermediate_cache2 = torch.empty(
+        (M * topk_ids.shape[1], N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    intermediate_cache1 = torch.ops._moe_C.marlin_gemm_moe(
+        hidden_states,
+        w1,
+        sorted_token_ids,
+        topk_weights,
+        topk_ids,
+        w1_scale,
+        g_idx1,
+        rand_perm1,
+        workspace,
+        scalar_type,
+        M,
+        2 * N,
+        K,
+        True,
+        E,
+        topk,
+        block_size_m,
+        True,
+        False,
+    )
+
+    ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+
+    intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe(
+        intermediate_cache2,
+        w2,
+        sorted_token_ids,
+        topk_weights,
+        topk_ids,
+        w2_scale,
+        g_idx2,
+        rand_perm2,
+        workspace,
+        scalar_type,
+        M,
+        K,
+        N,
+        True,
+        E,
+        topk,
+        block_size_m,
+        False,
+        True,
+    )
+
+    # Reduce the output of the second GEMM over dim 1.
+    return torch.sum(intermediate_cache3, dim=1)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 36323493d601..abdc28bfebcc 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -269,19 +269,21 @@ def apply(
         custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
 
-        from vllm.model_executor.layers.fused_moe.fused_moe import (
-            fused_marlin_moe)
-
-        return fused_marlin_moe(x,
-                                layer.w13_weight_packed,
-                                layer.w2_weight_packed,
-                                router_logits,
-                                layer.w13_g_idx,
-                                layer.w2_g_idx,
-                                layer.w13_g_idx_sort_indices,
-                                layer.w2_g_idx_sort_indices,
-                                top_k,
-                                custom_routing_function,
-                                renormalize=renormalize,
-                                w1_scale=layer.w13_weight_scale,
-                                w2_scale=layer.w2_weight_scale)
+        from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+            fused_moe_marlin)
+
+        return fused_moe_marlin(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            router_logits,
+            layer.w13_g_idx,
+            layer.w2_g_idx,
+            layer.w13_g_idx_sort_indices,
+            layer.w2_g_idx_sort_indices,
+            top_k,
+            renormalize=renormalize,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            num_bits=self.num_bits,
+        )
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 0ec68ac5b0f2..699d5f184414 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -176,6 +176,23 @@ def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
     return s
 
 
+def marlin_moe_permute_scales(
+    s: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+):
+    # Permute each expert's scales separately with marlin_permute_scales.
+    n_experts, rows, cols = s.shape[0], s.shape[1], s.shape[2]
+    permuted = torch.empty((n_experts, rows, cols),
+                           device=s.device,
+                           dtype=s.dtype)
+    for idx in range(n_experts):
+        permuted[idx] = marlin_permute_scales(s[idx], size_k, size_n,
+                                              group_size)
+    return permuted
+
+
 def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
                        num_bits: int) -> torch.Tensor:
     # Permute zero-points in a similar way to scales, but do not use the
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 7d08ac6f8746..4a06c5d63d52 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -1,6 +1,6 @@
 """Utility functions used for tests and benchmarks"""
 
-from typing import List
+from typing import List, Optional
 
 import numpy as np
 import torch
@@ -92,8 +92,11 @@ def get_weight_perm(num_bits: int):
     return perm
 
 
-def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int,
-                    act_order: bool):
+def marlin_quantize(w: torch.Tensor,
+                    quant_type: ScalarType,
+                    group_size: int,
+                    act_order: bool,
+                    test_perm: Optional[torch.Tensor] = None):
     size_k, size_n = w.shape
     num_bits = quant_type.size_bits
 
@@ -104,7 +107,7 @@ def marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int,
 
     # Quantize (and apply act_order if provided)
     w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
-        w, quant_type, group_size, act_order)
+        w, quant_type, group_size, act_order, test_perm)
 
     # For act_order, sort the "weights" and "g_idx" so that group ids are
     # increasing
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 33f24ff5d54d..bdfda31de852 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -1,5 +1,5 @@
 """This file is used for /tests and /benchmarks"""
-from typing import List
+from typing import List, Optional
 
 import numpy
 import torch
@@ -53,7 +53,10 @@ def get_pack_factor(num_bits):
     return 32 // num_bits
 
 
-def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
+def permute_rows(q_w: torch.Tensor,
+                 w_ref: torch.Tensor,
+                 group_size: int,
+                 test_perm: Optional[torch.Tensor] = None):
     assert q_w.shape == w_ref.shape
 
     orig_device = q_w.device
@@ -64,7 +67,7 @@ def permute_rows(q_w: torch.Tensor, w_ref: torch.Tensor, group_size: int):
         g_idx[i] = i // group_size
 
     # Simulate act_order by doing a random permutation on K
-    rand_perm = torch.randperm(k_size)
+    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)
 
     g_idx = g_idx[rand_perm].contiguous()
     q_w = q_w[rand_perm, :].contiguous()
@@ -164,8 +167,11 @@ def reshape_w(w):
     )
 
 
-def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType,
-                          group_size: int, act_order: bool):
+def gptq_quantize_weights(w: torch.Tensor,
+                          quant_type: ScalarType,
+                          group_size: int,
+                          act_order: bool,
+                          test_perm: Optional[torch.Tensor] = None):
     size_k, _ = w.shape
 
     assert w.is_floating_point(), "w must be float"
@@ -186,7 +192,8 @@ def gptq_quantize_weights(w: torch.Tensor, quant_type: ScalarType,
         ), "For act_order, groupsize = {} must be less than size_k = {}".format(
             group_size, size_k)
 
-        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size)
+        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size,
+                                                    test_perm)
 
     return w_ref, w_q, w_s, g_idx, rand_perm
 

From 430a9cb0f3c61702fbfeb8c59a7fdaac44344ae8 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 30 Aug 2024 09:36:33 -0400
Subject: [PATCH 24/46] fix rocm

---
 csrc/moe/torch_bindings.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index d2352375de33..e4fce091d24a 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -9,6 +9,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "token_expert_indices, Tensor gating_output) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
 
+#ifndef USE_ROCM
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
@@ -19,5 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 }
+#endif
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

From 48047aae2510b6e5de588032797c4cc4059650fc Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 30 Aug 2024 09:45:52 -0400
Subject: [PATCH 25/46] bad paste

---
 csrc/moe/torch_bindings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index e4fce091d24a..cd65a8ee92b9 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -19,7 +19,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "int moe_block_size, bool replicate_input, bool apply_weights)"
       " -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
-}
 #endif
+}
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

From bfc4faed9562603fcc71c92d2c9fc293d9cc2130 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 30 Aug 2024 17:29:42 +0000
Subject: [PATCH 26/46] add test case; fix imports for tests

---
 tests/weight_loading/models.txt                          | 1 +
 vllm/model_executor/layers/fused_moe/__init__.py         | 8 ++++----
 vllm/model_executor/layers/fused_moe/fused_moe_marlin.py | 5 ++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 1dc529037a98..5eee2cc53444 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -21,6 +21,7 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 65a9b78a118c..06bd2706d7e4 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,5 +1,3 @@
-from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -8,16 +6,18 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "fused_moe_marlin",
-    "single_moe_marlin",
 ]
 
 if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
+    from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+        fused_moe_marlin, single_moe_marlin)
 
     __all__ += [
+        "fused_moe_marlin",
+        "single_moe_marlin",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
index 40f9f66f1706..40b409ebeb34 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -5,11 +5,10 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    fused_topk, moe_align_block_size, try_get_optimal_moe_config)
 from vllm.scalar_type import scalar_types
 
-from .fused_moe import (fused_topk, moe_align_block_size,
-                        try_get_optimal_moe_config)
-
 
 def single_moe_marlin(
     hidden_states: torch.Tensor,

From c5a2f6282cd60fa158f23536399dbbc98896bc63 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 30 Aug 2024 20:12:47 +0000
Subject: [PATCH 27/46] fix to adapt custom_routing_function

---
 .../layers/fused_moe/fused_moe_marlin.py            | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
index 40b409ebeb34..8c49333f7c84 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -1,6 +1,6 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 
 import torch
 
@@ -106,7 +106,8 @@ def fused_moe_marlin(
     rand_perm1: torch.Tensor,
     rand_perm2: torch.Tensor,
     topk: int,
-    renormalize: bool,
+    custom_routing_function: Optional[Callable] = None,
+    renormalize: bool = True,
     override_config: Optional[Dict[str, Any]] = None,
     use_fp8: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
@@ -161,8 +162,12 @@ def fused_moe_marlin(
     E = w1.shape[0]
     N = w2.shape[1] * 16
 
-    topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                        renormalize)
+    if custom_routing_function is None:
+        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+                                            renormalize)
+    else:
+        topk_weights, topk_ids = custom_routing_function(
+            hidden_states, gating_output, topk, renormalize)
 
     get_config_func = functools.partial(
         try_get_optimal_moe_config,

From 2b308c469a446aca61aa225867012fdef1513168 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Mon, 2 Sep 2024 03:04:07 -0400
Subject: [PATCH 28/46] Use select_experts to compute top_k tensors in fused
 moe

---
 tests/kernels/test_moe.py                         |  7 ++++++-
 .../layers/fused_moe/fused_moe_marlin.py          | 11 +++--------
 .../compressed_tensors/compressed_tensors_moe.py  | 15 +++++++++++++--
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index f7642bf02b05..2cfd76d1c780 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -11,6 +11,7 @@
 
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
     fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
@@ -218,6 +219,9 @@ def test_fused_marlin_moe(
     sort_indices2 = stack_and_dev(sort_indices2_l)
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    topk_weights, topk_ids = fused_topk(a, score, topk, False)
+
     triton_output = fused_moe(
         a,
         w_ref1.transpose(1, 2).contiguous(),
@@ -235,7 +239,8 @@ def test_fused_marlin_moe(
         g_idx2,
         sort_indices1,
         sort_indices2,
-        topk,
+        topk_weights,
+        topk_ids,
         renormalize=False,
         w1_scale=scales1,
         w2_scale=scales2,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
index 8c49333f7c84..45dead9740f4 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -105,7 +105,8 @@ def fused_moe_marlin(
     g_idx2: torch.Tensor,
     rand_perm1: torch.Tensor,
     rand_perm2: torch.Tensor,
-    topk: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
     custom_routing_function: Optional[Callable] = None,
     renormalize: bool = True,
     override_config: Optional[Dict[str, Any]] = None,
@@ -161,13 +162,7 @@ def fused_moe_marlin(
     M, K = hidden_states.shape
     E = w1.shape[0]
     N = w2.shape[1] * 16
-
-    if custom_routing_function is None:
-        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                            renormalize)
-    else:
-        topk_weights, topk_ids = custom_routing_function(
-            hidden_states, gating_output, topk, renormalize)
+    topk = topk_ids.shape[1]
 
     get_config_func = functools.partial(
         try_get_optimal_moe_config,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index abdc28bfebcc..53769cb73153 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -5,7 +5,7 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
+from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
@@ -272,6 +272,16 @@ def apply(
         from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
             fused_moe_marlin)
 
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function)
+
         return fused_moe_marlin(
             x,
             layer.w13_weight_packed,
@@ -281,7 +291,8 @@ def apply(
             layer.w2_g_idx,
             layer.w13_g_idx_sort_indices,
             layer.w2_g_idx_sort_indices,
-            top_k,
+            topk_weights,
+            topk_ids,
             renormalize=renormalize,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,

From 71256d45a491b896699416e74df87751ae1cdfc3 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Tue, 3 Sep 2024 10:42:10 -0400
Subject: [PATCH 29/46] bring back fused_moe_marlin -> fused_marlin_moe

---
 tests/kernels/test_moe.py                                 | 8 ++++----
 vllm/model_executor/layers/fused_moe/__init__.py          | 8 ++++----
 .../{fused_moe_marlin.py => fused_marlin_moe.py}          | 4 ++--
 .../compressed_tensors/compressed_tensors_moe.py          | 6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)
 rename vllm/model_executor/layers/fused_moe/{fused_moe_marlin.py => fused_marlin_moe.py} (99%)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 2cfd76d1c780..606997843982 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -11,9 +11,9 @@
 
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     marlin_quantize)
 from vllm.model_executor.models.mixtral import MixtralMoE
@@ -230,7 +230,7 @@ def test_fused_marlin_moe(
         topk,
         renormalize=False,
     )
-    marlin_output = fused_moe_marlin(
+    marlin_output = fused_marlin_moe(
         a,
         qweight1,
         qweight2,
@@ -309,7 +309,7 @@ def test_marlin_moe_mmm(
     sort_indices = stack_and_dev(sort_indices_l)
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
-    marlin_output = single_moe_marlin(a,
+    marlin_output = single_marlin_moe(a,
                                       qweight,
                                       scales,
                                       score,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 06bd2706d7e4..e9b5703ca28b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -9,15 +9,15 @@
 ]
 
 if HAS_TRITON:
+    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
-    from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-        fused_moe_marlin, single_moe_marlin)
 
     __all__ += [
-        "fused_moe_marlin",
-        "single_moe_marlin",
+        "fused_marlin_moe",
+        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
similarity index 99%
rename from vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
rename to vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 45dead9740f4..5866c83cd9c8 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -10,7 +10,7 @@
 from vllm.scalar_type import scalar_types
 
 
-def single_moe_marlin(
+def single_marlin_moe(
     hidden_states: torch.Tensor,
     w: torch.Tensor,
     scales: torch.Tensor,
@@ -96,7 +96,7 @@ def single_moe_marlin(
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
 
 
-def fused_moe_marlin(
+def fused_marlin_moe(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w2: torch.Tensor,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 53769cb73153..b14ef433d539 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -269,8 +269,8 @@ def apply(
         custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
 
-        from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-            fused_moe_marlin)
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            fused_marlin_moe)
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -282,7 +282,7 @@ def apply(
             num_expert_group=num_expert_group,
             custom_routing_function=custom_routing_function)
 
-        return fused_moe_marlin(
+        return fused_marlin_moe(
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,

From 7aa844c8561768190443ebf84ff29021e5d70a9a Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Tue, 3 Sep 2024 11:03:53 -0400
Subject: [PATCH 30/46] GPTQ Fused MoE class

---
 .../layers/fused_moe/__init__.py              |   3 +-
 vllm/model_executor/layers/fused_moe/layer.py | 155 +++++++++++++++++-
 2 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index e9b5703ca28b..7f27e2660db6 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,11 +1,12 @@
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, GPTQFusedMoE)
 from vllm.triton_utils import HAS_TRITON
 
 __all__ = [
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
+    "GPTQFusedMoE",
 ]
 
 if HAS_TRITON:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 3df0b61a9ebe..9643642b9b53 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -498,4 +498,157 @@ def _load_fp8_scale(self, param: torch.nn.Parameter,
                 param_data[expert_id][idx] = loaded_weight
             # If we are in the row parallel case (down_proj)
             else:
-                param_data[expert_id] = loaded_weight
\ No newline at end of file
+                param_data[expert_id] = loaded_weight
+
+
+class GPTQFusedMoE(torch.nn.Module):
+    """GPTQFusedMoE layer for GPTQ MoE models.
+    This layer contains both MergedColumnParallel weights (gate_up_proj / 
+    w13) and RowParallelLinear weights (down_proj/ w2).
+    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+    copy that naming convention here and handle any remapping in the
+    load_weights function in each model implementation.
+    Args:
+        num_experts: Number of experts in the model
+        top_k: Number of experts selected for each token
+        hidden_size: Input hidden state size of the transformer
+        intermediate_size: Intermediate size of the experts
+        params_dtype: Data type for the parameters.
+        reduce_results: Whether to all all_reduce on the output of the layer
+        renomalize: Whether to renormalize the logits in the fused_moe kernel
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = False,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        self.tp_size = (tp_size if tp_size is not None else
+                        get_tensor_model_parallel_world_size())
+        self.top_k = top_k
+        self.num_experts = num_experts
+        self.intermediate_size = intermediate_size
+        self.intermediate_size_per_partition = intermediate_size // self.tp_size
+        self.reduce_results = reduce_results
+        self.renormalize = renormalize
+        assert (not use_grouped_topk and num_expert_group is None
+                and topk_group is None)
+
+        if quant_config is None:
+            self.quant_method: Optional[
+                QuantizeMethodBase] = UnquantizedFusedMoEMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)
+        assert self.quant_method is not None
+
+        self.quant_method.create_weights(
+            layer=self,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=self.intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            weight_loader=self.weight_loader,
+        )
+
+    def weight_loader(self, param: torch.nn.Parameter,
+                      loaded_weight: torch.Tensor, weight_name: str,
+                      shard_id: str, expert_id: int) -> None:
+
+        if ("_qweight" in weight_name or "_scales" in weight_name
+                or "_qzeros" in weight_name):
+            if "w13" in weight_name:
+                shard_size = loaded_weight.size()[-1]
+                if shard_id == "w1":
+                    param.data[expert_id, :, :shard_size] = loaded_weight
+                elif shard_id == "w2" or shard_id == "w3":
+                    param.data[expert_id, :, shard_size:] = loaded_weight
+                else:
+                    raise ValueError(f"Invalid shard_id: {shard_id}: "
+                                     "must be w1, w2, or w3.")
+            elif "w2" in weight_name:
+                param.data[expert_id][:] = loaded_weight
+            else:
+                raise ValueError(f"Invalid weight name: {weight_name}: "
+                                 "must contain 'w13' or 'w2'.")
+        elif "_g_idx" in weight_name:
+            if "w13" not in weight_name and "w2" not in weight_name:
+                raise ValueError(f"Invalid weight name: {weight_name}: "
+                                 "must contain 'w13' or 'w2'.")
+            param.data[expert_id] = loaded_weight
+        else:
+            raise ValueError(f"Invalid weight name: {weight_name}.")
+
+    @staticmethod
+    def select_experts(hidden_states: torch.Tensor,
+                       router_logits: torch.Tensor,
+                       top_k: int,
+                       use_grouped_topk: bool,
+                       renormalize: bool,
+                       topk_group: Optional[int] = None,
+                       num_expert_group: Optional[int] = None):
+        assert (not use_grouped_topk and topk_group is None
+                and num_expert_group is None)
+        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+
+        topk_weights, topk_ids = fused_topk(hidden_states=hidden_states,
+                                            gating_output=router_logits,
+                                            topk=top_k,
+                                            renormalize=renormalize)
+
+        return topk_weights, topk_ids
+
+    def forward(self, hidden_states: torch.Tensor,
+                router_logits: torch.Tensor):
+        assert self.quant_method is not None
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=False,
+            topk_group=False,
+            num_expert_group=False)
+
+        if self.reduce_results and self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        return final_hidden_states
+
+    @classmethod
+    def make_expert_params_mapping(
+            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
+            ckpt_up_proj_name: str,
+            num_experts: int) -> List[Tuple[str, str, int, str]]:
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_" if weight_name
+             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
+             f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
+            for expert_id in range(num_experts) for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]

From 0f7bec3f03f9b7157f237f4dc9e7550ee5487f5f Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Tue, 3 Sep 2024 12:41:52 -0400
Subject: [PATCH 31/46] Add GPTQMarlinMoEMethod to gptq_marlin.py

---
 .../layers/quantization/gptq_marlin.py        | 304 +++++++++++++++++-
 1 file changed, 289 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index b06ff7bd2bac..aac84b4586a8 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,18 +1,25 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from torch.nn import Parameter
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe)
+from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
+                                                        FusedMoEMethodBase,
+                                                        GPTQFusedMoE)
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full,
-    marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales,
-    marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor,
-    verify_marlin_supported, verify_marlin_supports_shape)
+    marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales,
+    marlin_permute_scales, marlin_repeat_scales_on_all_ranks,
+    marlin_sort_g_idx, replace_tensor, verify_marlin_supported,
+    verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
@@ -33,8 +40,14 @@ class GPTQMarlinConfig(QuantizationConfig):
         (8, True): scalar_types.uint8b128,
     }
 
-    def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
-                 is_sym: bool, lm_head_quantized: bool) -> None:
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+    ) -> None:
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
             # (since we have only one group per output channel)
@@ -105,11 +118,14 @@ def override_quantization_method(cls, hf_quant_cfg,
                         " faster inference")
         return None
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["GPTQMarlinLinearMethod"]:
-        if (isinstance(layer, LinearBase) or
-            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]:
+        if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
+                                             and self.lm_head_quantized):
             return GPTQMarlinLinearMethod(self)
+        elif isinstance(layer, GPTQFusedMoE):
+            return GPTQMarlinMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -179,7 +195,8 @@ def create_weights(
             output_size_per_partition=output_size_per_partition,
             input_size_per_partition=input_size_per_partition,
             input_size=input_size,
-            group_size=group_size)
+            group_size=group_size,
+        )
 
         # Determine sharding
         if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act,
@@ -299,7 +316,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             perm=layer.g_idx_sort_indices,
             size_k=layer.input_size_per_partition,
             size_n=layer.output_size_per_partition,
-            num_bits=self.quant_config.quant_type.size_bits)
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
         replace_tensor(layer, "qweight", marlin_qweight)
 
         # Permute scales from autogptq format to marlin format.
@@ -308,7 +326,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             size_k=(layer.input_size if self.quant_config.desc_act else
                     layer.input_size_per_partition),
             size_n=layer.output_size_per_partition,
-            group_size=self.quant_config.group_size)
+            group_size=self.quant_config.group_size,
+        )
         replace_tensor(layer, "scales", marlin_scales)
 
     def apply(
@@ -329,4 +348,259 @@ def apply(
             output_size_per_partition=layer.output_size_per_partition,
             input_size_per_partition=layer.input_size_per_partition,
             is_k_full=layer.is_k_full,
-            bias=bias)
+            bias=bias,
+        )
+
+
+class GPTQMarlinMoEMethod(FusedMoEMethodBase):
+    """MoE Marlin method with quantization."""
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Currently assuming is_k_full is always True
+        # (input size per partition is the same as full input size)
+        # Supports only sym for now (no zp)
+        if self.quant_config.group_size != -1:
+            scales_size13 = hidden_size // self.quant_config.group_size
+            scales_size2 = intermediate_size // self.quant_config.group_size
+        else:
+            scales_size13 = 1
+            scales_size2 = 1
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.quant_config.pack_factor,
+                2 * intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size // self.quant_config.pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        # up_proj scales
+        w13_scales = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size13,
+                        2 * intermediate_size,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+        # down_proj scales
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size2,
+                        hidden_size,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+        # up_proj zero points (qzeros)
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size13,
+                        2 * intermediate_size // self.quant_config.pack_factor,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+        # down_proj zero points (qzeros)
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(num_experts,
+                        scales_size2,
+                        hidden_size // self.quant_config.pack_factor,
+                        dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices",
+                                 w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices",
+                                 w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # Process act_order
+        if self.quant_config.desc_act:
+            # Get sorting based on g_idx
+            num_experts = layer.w13_g_idx.shape[0]
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(
+                    layer.w13_g_idx[e]).to(torch.int32)
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
+                    torch.int32)
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][
+                    w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][
+                    w2_g_idx_sort_indices[e]]
+            replace_tensor(layer, "w13_g_idx", w13_sorted_g_idx)
+            replace_tensor(layer, "w2_g_idx", w2_sorted_g_idx)
+            replace_tensor(layer, "w13_g_idx_sort_indices",
+                           w13_g_idx_sort_indices)
+            replace_tensor(layer, "w2_g_idx_sort_indices",
+                           w2_g_idx_sort_indices)
+        else:
+            # Reset g_idx related tensors
+            num_experts = layer.w13_g_idx.shape[0]
+            device = layer.w13_g_idx.device
+            layer.w13_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32,
+                            device=device),
+                requires_grad=False,
+            )
+        # Repack weights
+        marlin_w13_qweight = ops.gptq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w13_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_tensor(layer, "w13_qweight", marlin_w13_qweight)
+        marlin_w2_qweight = ops.gptq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w2_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_tensor(layer, "w2_qweight", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=(layer.intermediate_size if self.quant_config.desc_act else
+                    layer.intermediate_size_per_partition),
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_tensor(layer, "w13_scales", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor,
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_tensor(layer, "w2_scales", marlin_w2_scales)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool = True,
+        use_grouped_topk: bool = False,
+        num_expert_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+    ) -> torch.Tensor:
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=None)
+
+        return fused_marlin_moe(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            router_logits,
+            layer.w13_g_idx,
+            layer.w2_g_idx,
+            layer.w13_g_idx_sort_indices,
+            layer.w2_g_idx_sort_indices,
+            topk_weights,
+            topk_ids,
+            renormalize=renormalize,
+            w1_scale=layer.w13_scales,
+            w2_scale=layer.w2_scales,
+            num_bits=self.quant_config.quant_type.size_bits,
+        )

From cb0001e1ca3f4637f6629925dbca15d361e048bb Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 09:00:15 -0400
Subject: [PATCH 32/46] Use FusedMoE layer for all loads

---
 .../layers/fused_moe/__init__.py              |   3 +-
 vllm/model_executor/layers/fused_moe/layer.py | 172 ++----------------
 .../layers/quantization/gptq_marlin.py        |   5 +-
 3 files changed, 22 insertions(+), 158 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 7f27e2660db6..e9b5703ca28b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,12 +1,11 @@
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, GPTQFusedMoE)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
 __all__ = [
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "GPTQFusedMoE",
 ]
 
 if HAS_TRITON:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 9643642b9b53..b0d7d4b538df 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -334,6 +334,25 @@ def weight_loader(self, param: torch.nn.Parameter,
             loaded_weight = loaded_weight.t().contiguous()
             shard_dim = ~shard_dim
 
+        # GPTQ Values
+        if ("scales" in weight_name or "qweight" in weight_name
+                or "qzeros" in weight_name):
+            if (shard_id == "w1" or shard_id == "w3"):
+                shard_dim = 1 - shard_dim
+            self._load_model_weight_or_group_weight_scale(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank)
+            return
+
+        if "g_idx" in weight_name:
+            self._load_single_value(param=param,
+                                    loaded_weight=loaded_weight,
+                                    expert_id=expert_id)
+            return
+
         # Case weight_scales
         if "weight_scale" in weight_name:
             # load the weight scaling based on the quantization scheme
@@ -499,156 +518,3 @@ def _load_fp8_scale(self, param: torch.nn.Parameter,
             # If we are in the row parallel case (down_proj)
             else:
                 param_data[expert_id] = loaded_weight
-
-
-class GPTQFusedMoE(torch.nn.Module):
-    """GPTQFusedMoE layer for GPTQ MoE models.
-    This layer contains both MergedColumnParallel weights (gate_up_proj / 
-    w13) and RowParallelLinear weights (down_proj/ w2).
-    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
-    copy that naming convention here and handle any remapping in the
-    load_weights function in each model implementation.
-    Args:
-        num_experts: Number of experts in the model
-        top_k: Number of experts selected for each token
-        hidden_size: Input hidden state size of the transformer
-        intermediate_size: Intermediate size of the experts
-        params_dtype: Data type for the parameters.
-        reduce_results: Whether to all all_reduce on the output of the layer
-        renomalize: Whether to renormalize the logits in the fused_moe kernel
-        quant_config: Quantization configure.
-    """
-
-    def __init__(
-        self,
-        num_experts: int,
-        top_k: int,
-        hidden_size: int,
-        intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        reduce_results: bool = False,
-        renormalize: bool = True,
-        use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
-        topk_group: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
-        prefix: str = "",
-    ):
-        super().__init__()
-
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        self.tp_size = (tp_size if tp_size is not None else
-                        get_tensor_model_parallel_world_size())
-        self.top_k = top_k
-        self.num_experts = num_experts
-        self.intermediate_size = intermediate_size
-        self.intermediate_size_per_partition = intermediate_size // self.tp_size
-        self.reduce_results = reduce_results
-        self.renormalize = renormalize
-        assert (not use_grouped_topk and num_expert_group is None
-                and topk_group is None)
-
-        if quant_config is None:
-            self.quant_method: Optional[
-                QuantizeMethodBase] = UnquantizedFusedMoEMethod()
-        else:
-            self.quant_method = quant_config.get_quant_method(self, prefix)
-        assert self.quant_method is not None
-
-        self.quant_method.create_weights(
-            layer=self,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=self.intermediate_size_per_partition,
-            params_dtype=params_dtype,
-            weight_loader=self.weight_loader,
-        )
-
-    def weight_loader(self, param: torch.nn.Parameter,
-                      loaded_weight: torch.Tensor, weight_name: str,
-                      shard_id: str, expert_id: int) -> None:
-
-        if ("_qweight" in weight_name or "_scales" in weight_name
-                or "_qzeros" in weight_name):
-            if "w13" in weight_name:
-                shard_size = loaded_weight.size()[-1]
-                if shard_id == "w1":
-                    param.data[expert_id, :, :shard_size] = loaded_weight
-                elif shard_id == "w2" or shard_id == "w3":
-                    param.data[expert_id, :, shard_size:] = loaded_weight
-                else:
-                    raise ValueError(f"Invalid shard_id: {shard_id}: "
-                                     "must be w1, w2, or w3.")
-            elif "w2" in weight_name:
-                param.data[expert_id][:] = loaded_weight
-            else:
-                raise ValueError(f"Invalid weight name: {weight_name}: "
-                                 "must contain 'w13' or 'w2'.")
-        elif "_g_idx" in weight_name:
-            if "w13" not in weight_name and "w2" not in weight_name:
-                raise ValueError(f"Invalid weight name: {weight_name}: "
-                                 "must contain 'w13' or 'w2'.")
-            param.data[expert_id] = loaded_weight
-        else:
-            raise ValueError(f"Invalid weight name: {weight_name}.")
-
-    @staticmethod
-    def select_experts(hidden_states: torch.Tensor,
-                       router_logits: torch.Tensor,
-                       top_k: int,
-                       use_grouped_topk: bool,
-                       renormalize: bool,
-                       topk_group: Optional[int] = None,
-                       num_expert_group: Optional[int] = None):
-        assert (not use_grouped_topk and topk_group is None
-                and num_expert_group is None)
-        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
-
-        topk_weights, topk_ids = fused_topk(hidden_states=hidden_states,
-                                            gating_output=router_logits,
-                                            topk=top_k,
-                                            renormalize=renormalize)
-
-        return topk_weights, topk_ids
-
-    def forward(self, hidden_states: torch.Tensor,
-                router_logits: torch.Tensor):
-        assert self.quant_method is not None
-
-        # Matrix multiply.
-        final_hidden_states = self.quant_method.apply(
-            layer=self,
-            x=hidden_states,
-            router_logits=router_logits,
-            top_k=self.top_k,
-            renormalize=self.renormalize,
-            use_grouped_topk=False,
-            topk_group=False,
-            num_expert_group=False)
-
-        if self.reduce_results and self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
-
-        return final_hidden_states
-
-    @classmethod
-    def make_expert_params_mapping(
-            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
-            ckpt_up_proj_name: str,
-            num_experts: int) -> List[Tuple[str, str, int, str]]:
-
-        return [
-            # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_" if weight_name
-             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
-             f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
-            for expert_id in range(num_experts) for shard_id, weight_name in [
-                ("w1", ckpt_gate_proj_name),
-                ("w2", ckpt_down_proj_name),
-                ("w3", ckpt_up_proj_name),
-            ]
-        ]
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index aac84b4586a8..698a4c29d7a0 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -8,8 +8,7 @@
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     fused_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
-                                                        FusedMoEMethodBase,
-                                                        GPTQFusedMoE)
+                                                        FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -124,7 +123,7 @@ def get_quant_method(
         if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
                                              and self.lm_head_quantized):
             return GPTQMarlinLinearMethod(self)
-        elif isinstance(layer, GPTQFusedMoE):
+        elif isinstance(layer, FusedMoE):
             return GPTQMarlinMoEMethod(self)
         return None
 

From 33090a3f93c07e302cc6ef5960f4cad723f808c1 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 11:25:08 -0400
Subject: [PATCH 33/46] Make sure that GPTQ runs through mixtral.py

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 6 +++---
 vllm/model_executor/model_loader/utils.py              | 2 +-
 vllm/model_executor/models/mixtral.py                  | 6 ++++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 698a4c29d7a0..b0f972182d59 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from torch.nn import Parameter
@@ -551,8 +551,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Repack scales
         marlin_w13_scales = marlin_moe_permute_scales(
             s=layer.w13_scales,
-            size_k=(layer.intermediate_size if self.quant_config.desc_act else
-                    layer.intermediate_size_per_partition),
+            size_k=layer.intermediate_size_per_partition,
             size_n=layer.w13_scales.shape[2],
             group_size=self.quant_config.group_size,
         )
@@ -575,6 +574,7 @@ def apply(
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
         topk_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
 
         topk_weights, topk_ids = FusedMoE.select_experts(
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 4bb943ab3afe..d247e4cf3f07 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -23,7 +23,7 @@ def get_model_architecture(
     architectures = getattr(model_config.hf_config, "architectures", [])
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
-    mixtral_supported = ["fp8", "compressed-tensors"]
+    mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin"]
     if (model_config.quantization is not None
             and model_config.quantization not in mixtral_supported
             and "MixtralForCausalLM" in architectures):
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index e744e36ac08b..6413b56605ec 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -435,7 +435,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
+                if name.endswith("bias") and name not in params_dict:
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -454,6 +454,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
+                    if name.endswith("bias") and name not in params_dict:
+                        continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
                     weight_loader(param,
@@ -464,7 +466,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
+                    if name.endswith("bias") and name not in params_dict:
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):

From d4798373c1b861aee79d665fbe8a56d945da9a42 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 12:40:52 -0400
Subject: [PATCH 34/46] enforce float16A/scales for marlin moe

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++--
 vllm/model_executor/models/mixtral.py                  | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index b0f972182d59..b53267c0bd06 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -404,7 +404,7 @@ def create_weights(
             torch.empty(num_experts,
                         scales_size13,
                         2 * intermediate_size,
-                        dtype=params_dtype),
+                        dtype=torch.half),
             requires_grad=False,
         )
         layer.register_parameter("w13_scales", w13_scales)
@@ -414,7 +414,7 @@ def create_weights(
             torch.empty(num_experts,
                         scales_size2,
                         hidden_size,
-                        dtype=params_dtype),
+                        dtype=torch.half),
             requires_grad=False,
         )
         layer.register_parameter("w2_scales", w2_scales)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 6413b56605ec..148ef393277e 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -95,11 +95,12 @@ def __init__(self,
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
+        orig_dtype = hidden_states.dtype
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states, router_logits)
-        return final_hidden_states.view(orig_shape)
+        final_hidden_states = self.experts(hidden_states.half(), router_logits)
+        return final_hidden_states.view(orig_shape).to(orig_dtype)
 
 
 class MixtralAttention(nn.Module):

From 8baaec644b2468e263f14022b01c8b55d3893ad6 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Wed, 4 Sep 2024 15:28:23 +0000
Subject: [PATCH 35/46] remove large model

---
 tests/weight_loading/models.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 5eee2cc53444..1dc529037a98 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -21,7 +21,6 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main

From 8fbc181dfa2747a8d5dbf03ef207c9b163a68c75 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 13:10:02 -0400
Subject: [PATCH 36/46] Cleanup, comments

---
 csrc/moe/marlin_moe_ops.cu                    |  4 +-
 tests/kernels/test_moe.py                     |  1 -
 .../layers/fused_moe/__init__.py              |  8 +--
 .../layers/fused_moe/fused_marlin_moe.py      | 50 ++++++++-----------
 .../compressed_tensors_moe.py                 |  1 -
 5 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index e3c18ce5a50b..f6d475a56851 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -1228,8 +1228,6 @@ __device__ inline void MarlinMoESingle(
     if (slice_iters == 0) {
       cp_async_wait<0>();
       bool last = slice_idx == slice_count - 1;
-      // For per-column scales, we only fetch them here in the final step before
-      // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
         if constexpr (w_type.size_bits() == 8) {
           if (s_sh_wr_pred) {
@@ -1237,6 +1235,8 @@ __device__ inline void MarlinMoESingle(
           }
           cp_async_fence();
         } else {
+          // For 4-bit per-column scales, we only fetch them here in the
+          // final step before write-out
           if (last) {
             if (s_sh_wr_pred) {
               cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 606997843982..7e359ff08088 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -241,7 +241,6 @@ def test_fused_marlin_moe(
         sort_indices2,
         topk_weights,
         topk_ids,
-        renormalize=False,
         w1_scale=scales1,
         w2_scale=scales2,
         num_bits=num_bits,
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index e9b5703ca28b..dea4a32aec4f 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,3 +1,5 @@
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -6,18 +8,16 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
+    "fused_marlin_moe",
+    "single_marlin_moe",
 ]
 
 if HAS_TRITON:
-    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
 
     __all__ += [
-        "fused_marlin_moe",
-        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 5866c83cd9c8..c7906205760f 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -1,6 +1,6 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Dict, Optional
 
 import torch
 
@@ -16,11 +16,10 @@ def single_marlin_moe(
     scales: torch.Tensor,
     gating_output: torch.Tensor,
     g_idx: torch.Tensor,
-    rand_perm: torch.Tensor,
+    perm: torch.Tensor,
     topk: int,
     renormalize: bool,
     override_config: Optional[Dict[str, Any]] = None,
-    use_fp8: bool = False,
     num_bits: int = 8,
 ) -> torch.Tensor:
     """
@@ -28,18 +27,18 @@ def single_marlin_moe(
     and top-k gating mechanism. It is meant for testing and debugging.
 
     Parameters:
-    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
-    - w (torch.Tensor): The first set of expert weights.
+    - hidden_states (torch.Tensor): The input tensor to the Marlin Mul.
+    - w (torch.Tensor): The set of expert weights.
+    - scales (torch.Tensor): The quantization scales.
     - gating_output (torch.Tensor): The output of the gating operation
         (before softmax).
+    - g_idx (torch.Tensor): The act_order indices.
+    - perm (torch.Tensor): The act_order input permutation.
     - topk (int): The number of top-k experts to select.
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        product for w. Defaults to False.
+    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -55,8 +54,6 @@ def single_marlin_moe(
         torch.float32, torch.float16, torch.bfloat16
     ]
     assert num_bits in [4, 8]
-    # TODO support this
-    assert not use_fp8
 
     M, K = hidden_states.shape
     E = w.shape[0]
@@ -70,7 +67,7 @@ def single_marlin_moe(
                                         w.shape,
                                         w.shape,
                                         topk_ids.shape[1],
-                                        "float8" if use_fp8 else None,
+                                        None,
                                         override_config=override_config,
                                         is_marlin=True)
     config = get_config_func(M)
@@ -90,7 +87,7 @@ def single_marlin_moe(
 
     intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
         hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
-        g_idx, rand_perm, workspace, scalar_type, M, N, K, True, E, topk,
+        g_idx, perm, workspace, scalar_type, M, N, K, True, E, topk,
         block_size_m, True, False)
 
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
@@ -103,14 +100,11 @@ def fused_marlin_moe(
     gating_output: torch.Tensor,
     g_idx1: torch.Tensor,
     g_idx2: torch.Tensor,
-    rand_perm1: torch.Tensor,
-    rand_perm2: torch.Tensor,
+    perm1: torch.Tensor,
+    perm2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
-    custom_routing_function: Optional[Callable] = None,
-    renormalize: bool = True,
     override_config: Optional[Dict[str, Any]] = None,
-    use_fp8: bool = False,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
     num_bits: int = 8,
@@ -125,18 +119,20 @@ def fused_marlin_moe(
     - w2 (torch.Tensor): The second set of expert weights.
     - gating_output (torch.Tensor): The output of the gating operation
         (before softmax).
-    - topk (int): The number of top-k experts to select.
+    - g_idx1 (torch.Tensor): The first set of act_order indices.
+    - g_idx2 (torch.Tensor): The second set of act_order indices.
+    - perm1 (torch.Tensor): The first act_order input permutation.
+    - perm2 (torch.Tensor): The second act_order input permutation.
+    - topk_weights (torch.Tensor): Top-k weights.
+    - topk_ids (torch.Tensor): Indices of top-k elements.
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
-    - inplace (bool): If True, perform the operation in-place.
-        Defaults to False.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
-    - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
-        products for w1 and w2. Defaults to False.
     - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
         w1.
     - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
         w2.
+    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -156,8 +152,6 @@ def fused_marlin_moe(
         torch.float32, torch.float16, torch.bfloat16
     ]
     assert num_bits in [4, 8]
-    # TODO support this
-    assert not use_fp8
 
     M, K = hidden_states.shape
     E = w1.shape[0]
@@ -169,7 +163,7 @@ def fused_marlin_moe(
         w1.shape,
         w2.shape,
         topk_ids.shape[1],
-        "float8" if use_fp8 else None,
+        None,
         override_config=override_config,
         is_marlin=True,
     )
@@ -202,7 +196,7 @@ def fused_marlin_moe(
         topk_ids,
         w1_scale,
         g_idx1,
-        rand_perm1,
+        perm1,
         workspace,
         scalar_type,
         M,
@@ -226,7 +220,7 @@ def fused_marlin_moe(
         topk_ids,
         w2_scale,
         g_idx2,
-        rand_perm2,
+        perm2,
         workspace,
         scalar_type,
         M,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index b14ef433d539..7dee2fca8115 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -293,7 +293,6 @@ def apply(
             layer.w2_g_idx_sort_indices,
             topk_weights,
             topk_ids,
-            renormalize=renormalize,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
             num_bits=self.num_bits,

From 839915f285fcc09dff376b11735e1e828da3e924 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Wed, 4 Sep 2024 13:13:32 -0400
Subject: [PATCH 37/46] cleanup

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index b53267c0bd06..d593298cf2f1 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -598,7 +598,6 @@ def apply(
             layer.w2_g_idx_sort_indices,
             topk_weights,
             topk_ids,
-            renormalize=renormalize,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
             num_bits=self.quant_config.quant_type.size_bits,

From a5bc626e59fd755baf96a65cd6b68b136fd7e2f0 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Fri, 6 Sep 2024 03:12:10 -0400
Subject: [PATCH 38/46] remove 8-bit stuff for now

---
 csrc/moe/marlin_moe_ops.cu                    | 303 ++++++------------
 csrc/moe/marlin_moe_ops.h                     |   7 +-
 csrc/moe/torch_bindings.cpp                   |   8 +-
 tests/kernels/test_moe.py                     |  14 +-
 vllm/_custom_ops.py                           |   2 +-
 .../layers/fused_moe/__init__.py              |   8 +-
 .../layers/fused_moe/fused_marlin_moe.py      |  52 +--
 .../compressed_tensors_moe.py                 |   1 -
 .../schemes/compressed_tensors_wNa16.py       |   1 -
 .../layers/quantization/gptq_marlin.py        |   1 -
 10 files changed, 120 insertions(+), 277 deletions(-)

diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu
index f6d475a56851..92184f43c9eb 100644
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
@@ -25,8 +25,6 @@
 
 #include <iostream>
 
-#include "core/scalar_type.hpp"
-
 template <typename T>
 inline std::string str(T x) {
   return std::to_string(x);
@@ -133,26 +131,11 @@ __device__ inline int lop3(int a, int b, int c) {
   return res;
 }
 
-// Constructs destination register by taking bytes from 2 sources (based on
-// mask)
-template <int start_byte, int mask>
-__device__ inline uint32_t prmt(uint32_t a) {
-  uint32_t res;
-  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
-               : "=r"(res)
-               : "r"(a), "n"(start_byte), "n"(mask));
-  return res;
-}
-
-template <vllm::ScalarTypeId w_type_id>
-__device__ inline FragB dequant(int q);
-
-// Efficiently dequantize 4bit values packed in an int32 value into a full
-// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
-// with some small changes:
-// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
-template <>
-__device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant(int q) {
   const int LO = 0x000f000f;
   const int HI = 0x00f000f0;
   const int EX = 0x64006400;
@@ -173,28 +156,6 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
   return frag_b;
 }
 
-// Fast Int8ToFp16: Efficiently dequantize 8bit int values to fp16
-// Reference:
-// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
-template <>
-__device__ inline FragB dequant<vllm::kU8B128.id()>(int q) {
-  static constexpr uint32_t mask_for_elt_01 = 0x5250;
-  static constexpr uint32_t mask_for_elt_23 = 0x5351;
-  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
-
-  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
-  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
-
-  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
-
-  FragB frag_b;
-  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
-                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
-  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
-                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
-  return frag_b;
-}
-
 // Multiply dequantized values by the corresponding quantization scale; used
 // only for grouped quantization.
 __device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
@@ -335,8 +296,7 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   __syncthreads();
 }
 
-template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const int threads,          // number of threads in a threadblock
+template <const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -371,9 +331,6 @@ __device__ inline void MarlinMoESingle(
     bool apply_weights,    // apply weights to output
     int current_m_block    // current m block to start kernel computation from
 ) {
-  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
-  constexpr int pack_factor = 32 / w_type.size_bits();
-
   // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
   // better partitioning with less reductions
   int parallel = 1;
@@ -466,15 +423,12 @@ __device__ inline void MarlinMoESingle(
   constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);
 
   // B sizes/strides
-  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
-  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
-  constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
-  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
-
+  int b_gl_stride = 16 * prob_n / 32;
+  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
   int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
-  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
-  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
-  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
+  constexpr int b_sh_wr_delta = threads;
+  constexpr int b_sh_rd_delta = threads;
   constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
   constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
 
@@ -511,12 +465,12 @@ __device__ inline void MarlinMoESingle(
       a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
   a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
 
-  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
-                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  int b_gl_rd =
+      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
   b_gl_rd += b_sh_stride * slice_col;
   b_gl_rd += b_gl_rd_delta_o * slice_row;
-  int b_sh_wr = threadIdx.x * b_thread_vecs;
-  int b_sh_rd = threadIdx.x * b_thread_vecs;
+  int b_sh_wr = threadIdx.x;
+  int b_sh_rd = threadIdx.x;
 
   // For act_order
   constexpr int k_iter_size = tb_k / b_sh_wr_iters;
@@ -617,7 +571,7 @@ __device__ inline void MarlinMoESingle(
 
   // Register storage for double buffer of shared memory reads.
   FragA frag_a[2][thread_m_blocks];
-  I4 frag_b_quant[2][b_thread_vecs];
+  I4 frag_b_quant[2];
   FragC frag_c[thread_m_blocks][4][2];
   FragS frag_s[2][4];         // No act-order
   FragS act_frag_s[2][4][4];  // For act-order
@@ -683,10 +637,7 @@ __device__ inline void MarlinMoESingle(
       int4* sh_b_stage = sh_b + b_sh_stage * pipe;
   #pragma unroll
       for (int i = 0; i < b_sh_wr_iters; i++) {
-  #pragma unroll
-        for (int j = 0; j < b_thread_vecs; j++) {
-          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
-        }
+        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
         B_ptr[i] += b_gl_rd_delta_o;
       }
 
@@ -764,12 +715,8 @@ __device__ inline void MarlinMoESingle(
     for (int i = 0; i < thread_m_blocks; i++)
       ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
     int4* sh_b_stage = sh_b + b_sh_stage * pipe;
-
-  #pragma unroll
-    for (int i = 0; i < b_thread_vecs; i++) {
-      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
-          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
-    }
+    frag_b_quant[k % 2] = *reinterpret_cast<I4*>(
+        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
   };
 
   bool is_same_group[stages];
@@ -893,19 +840,10 @@ __device__ inline void MarlinMoESingle(
   // dequantization and matmul operations.
   #pragma unroll
     for (int j = 0; j < 4; j++) {
-      int b_quant_0, b_quant_1;
-      if constexpr (w_type.size_bits() == 4) {
-        b_quant_0 = frag_b_quant[k % 2][0][j];
-        b_quant_1 = b_quant_0 >> 8;
-      } else {
-        static_assert(w_type.size_bits() == 8);
-        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
-        b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
-        b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
-      }
+      int b_quant = frag_b_quant[k % 2][j];
+      int b_quant_shift = b_quant >> 8;
 
-      FragB frag_b0 = dequant<w_type_id>(b_quant_0);
-      FragB frag_b1 = dequant<w_type_id>(b_quant_1);
+      FragB frag_b0 = dequant(b_quant);
 
       // Apply scale to frag_b0
       if constexpr (has_act_order) {
@@ -917,6 +855,8 @@ __device__ inline void MarlinMoESingle(
         }
       }
 
+      FragB frag_b1 = dequant(b_quant_shift);
+
       // Apply scale to frag_b1
       if constexpr (has_act_order) {
         scale4(frag_b1, act_frag_s[k % 2][0][j], act_frag_s[k % 2][1][j],
@@ -941,13 +881,13 @@ __device__ inline void MarlinMoESingle(
   // multiple warps that accumulate their partial sums of the same output
   // location; which we have to reduce over in the end. We do in shared memory.
   auto thread_block_reduce = [&]() {
-    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    constexpr int red_off = threads / b_sh_stride / 2;
     if (red_off >= 1) {
-      int red_idx = threadIdx.x / b_sh_stride_threads;
-      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
-      constexpr int red_sh_delta = b_sh_stride_threads;
-      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
-                      (threadIdx.x % b_sh_stride_threads);
+      int red_idx = threadIdx.x / b_sh_stride;
+      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
+                      (threadIdx.x % b_sh_stride);
 
       // Parallel logarithmic shared memory reduction. We make sure to avoid any
       // unnecessary read or write iterations, e.g., for two warps we write only
@@ -1095,10 +1035,8 @@ __device__ inline void MarlinMoESingle(
     auto write = [&](int idx, float c0, float c1, FragS& s) {
       half2 res = __halves2half2(__float2half(c0), __float2half(c1));
 
-      // For per-column quantization we finally apply the scale here (only for
-      // 4-bit)
-      if constexpr (!has_act_order && group_blocks == -1 &&
-                    w_type.size_bits() == 4) {
+      // For per-column quantization we finally apply the scale here
+      if constexpr (!has_act_order && group_blocks == -1) {
         res = __hmul2(res, s[0]);
       }
 
@@ -1228,70 +1166,28 @@ __device__ inline void MarlinMoESingle(
     if (slice_iters == 0) {
       cp_async_wait<0>();
       bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
       if constexpr (!has_act_order && group_blocks == -1) {
-        if constexpr (w_type.size_bits() == 8) {
+        if (last) {
           if (s_sh_wr_pred) {
             cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
           }
           cp_async_fence();
-        } else {
-          // For 4-bit per-column scales, we only fetch them here in the
-          // final step before write-out
-          if (last) {
-            if (s_sh_wr_pred) {
-              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
-            }
-            cp_async_fence();
-          }
         }
       }
 
       thread_block_reduce();
       if constexpr (!has_act_order && group_blocks == -1) {
-        if constexpr (w_type.size_bits() == 8) {
+        if (last) {
           cp_async_wait<0>();
           __syncthreads();
           if (threadIdx.x / 32 < thread_n_blocks / 4) {
             reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
             reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
           }
-
-        } else {
-          if (last) {
-            cp_async_wait<0>();
-            __syncthreads();
-            if (threadIdx.x / 32 < thread_n_blocks / 4) {
-              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
-              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
-            }
-          }
         }
       }
-
-      // For 8-bit channelwise, we apply the scale before the global reduction
-      // that converts the fp32 results to fp16 (so that we avoid possible
-      // overflow in fp16)
-      if constexpr (!has_act_order && group_blocks == -1 &&
-                    w_type.size_bits() == 8) {
-        if (threadIdx.x / 32 < thread_n_blocks / 4) {
-  #pragma unroll
-          for (int i = 0; i < thread_m_blocks; i++) {
-  #pragma unroll
-            for (int j = 0; j < 4; j++) {
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][0]),
-                          frag_s[j / 2][2 * (j % 2) + 0]);
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][0][2]),
-                          frag_s[j / 2][2 * (j % 2) + 0]);
-
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][0]),
-                          frag_s[j / 2][2 * (j % 2) + 1]);
-              scale_float(reinterpret_cast<float*>(&frag_c[i][j][1][2]),
-                          frag_s[j / 2][2 * (j % 2) + 1]);
-            }
-          }
-        }
-      }
-
       if (slice_count > 1) {  // only globally reduce if there is more than one
                               // block in a slice
         barrier_acquire(&locks[slice_col], slice_idx);
@@ -1331,8 +1227,7 @@ __device__ inline void MarlinMoESingle(
   }
 }
 
-template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const int threads,          // number of threads in a threadblock
+template <const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1398,29 +1293,29 @@ __global__ void MarlinMoE(
   }
 
   if (max_block == 1) {
-    MarlinMoESingle<w_type_id, threads, 1, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 1, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 2) {
-    MarlinMoESingle<w_type_id, threads, 2, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 2, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else if (max_block == 3) {
-    MarlinMoESingle<w_type_id, threads, 3, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 3, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
         current_m_block);
   } else {
-    MarlinMoESingle<w_type_id, threads, 4, thread_n_blocks, thread_k_blocks,
-                    stages, has_act_order, group_blocks>(
+    MarlinMoESingle<threads, 4, thread_n_blocks, thread_k_blocks, stages,
+                    has_act_order, group_blocks>(
         A, B, C, sorted_ids_expert, topk_weights, scales_ptr, g_idx,
         expert_offsets, num_groups, expert_idx, num_experts, topk, prob_m,
         prob_n, prob_k, tot_m, locks, replicate_input, apply_weights,
@@ -1447,8 +1342,7 @@ __global__ void compute_expert_offsets(int const* __restrict__ topk_ids,
   return;
 }
 
-template <const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
-          const int threads,          // number of threads in a threadblock
+template <const int threads,          // number of threads in a threadblock
           const int thread_m_blocks,  // number of 16x16 blocks in the m
                                       // dimension (batchsize) of the
                                       // threadblock
@@ -1503,20 +1397,19 @@ const int STAGES = 4;  // 4 pipeline stages fit into shared memory
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;
 
-#define __CALL_IF_MOE(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS,               \
-                      THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS,           \
-                      NUM_THREADS)                                            \
-  else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&          \
+#define __CALL_IF_MOE(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,      \
+                      HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS)               \
+  else if (thread_m_blocks == THREAD_M_BLOCKS &&                              \
            thread_n_blocks == THREAD_N_BLOCKS &&                              \
            thread_k_blocks == THREAD_K_BLOCKS &&                              \
            has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS &&  \
            num_threads == NUM_THREADS) {                                      \
     cudaFuncSetAttribute(                                                     \
-        MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
+        MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,              \
                   THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>,      \
         cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);         \
-    MarlinMoE<W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,     \
-              THREAD_K_BLOCKS, STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>           \
+    MarlinMoE<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
+              STAGES, HAS_ACT_ORDER, GROUP_BLOCKS>                            \
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                    \
             A_ptr, B_ptr, C_ptr, sorted_ids_ptr, topk_weights_ptr, s_ptr,     \
             g_idx_ptr, expert_offsets_ptr, num_groups, expert_idx,            \
@@ -1601,43 +1494,42 @@ thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
   return thread_config_t{-1, -1, -1};
 }
 
-#define CALL_IF_MOE(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
-                                                                       \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
-  __CALL_IF_MOE(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+#define CALL_IF_MOE(N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+                                                               \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                               \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                               \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                               \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+  __CALL_IF_MOE(4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
 
 void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
                          const void* sorted_ids, const void* topk_weights,
                          const void* topk_ids, const void* s, const void* g_idx,
                          const void* perm, void* a_tmp, void* expert_offsets,
                          int prob_m, int prob_n, int prob_k, void* workspace,
-                         vllm::ScalarType const& q_type, bool has_act_order,
-                         bool is_k_full, int num_groups, int group_size,
-                         int num_experts, int topk, int moe_block_size, int dev,
-                         cudaStream_t stream, int thread_k, int thread_n,
-                         int sms, int max_par, bool replicate_input,
-                         bool apply_weights) {
+                         bool has_act_order, bool is_k_full, int num_groups,
+                         int group_size, int num_experts, int topk,
+                         int moe_block_size, int dev, cudaStream_t stream,
+                         int thread_k, int thread_n, int sms, int max_par,
+                         bool replicate_input, bool apply_weights) {
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -1719,13 +1611,10 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
     has_act_order = false;
   }
 
-  int pack_factor = 32 / q_type.size_bits();
-
   for (int expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
     const int4* A_ptr = (const int4*)A;
     int4* a_tmp_ptr = (int4*)a_tmp;
-    const int4* B_ptr =
-        (const int4*)B + (prob_n * prob_k / (pack_factor * 4)) * expert_idx;
+    const int4* B_ptr = (const int4*)B + (prob_n * prob_k / 32) * expert_idx;
     int4* C_ptr = (int4*)C;
     const float* topk_weights_ptr = (const float*)topk_weights;
     const int* sorted_ids_ptr = (const int*)sorted_ids;
@@ -1756,14 +1645,10 @@ void marlin_mm_moe_f16i4(const void* A, const void* B, void* C,
 
       if (false) {
       }
-      CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
-      CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
-      CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
-      CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
-      CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
-      CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
-      CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
-      CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
+      CALL_IF_MOE(16, 4, 256)
+      CALL_IF_MOE(8, 8, 256)
+      CALL_IF_MOE(8, 4, 128)
+      CALL_IF_MOE(4, 8, 128)
       else {
         TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
                                str(prob_n) + ", " + str(prob_k) + "]" +
@@ -1785,15 +1670,9 @@ torch::Tensor marlin_gemm_moe(
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
-    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
-    int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
+    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
     bool replicate_input, bool apply_weights) {
-  TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
-              "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
-
-  int pack_factor = 32 / b_q_type->size_bits();
-
   int max_par = 4;
 
   int dev = a.get_device();
@@ -1854,8 +1733,8 @@ torch::Tensor marlin_gemm_moe(
       topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
       g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
       expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
-      *b_q_type, has_act_order, is_k_full, num_groups, group_size, num_experts,
-      topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
+      has_act_order, is_k_full, num_groups, group_size, num_experts, topk,
+      moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
       thread_n, sms, max_par, replicate_input, apply_weights);
   return c;
 }
diff --git a/csrc/moe/marlin_moe_ops.h b/csrc/moe/marlin_moe_ops.h
index adee8399a4d6..43d264e0770d 100644
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@@ -2,14 +2,11 @@
 
 #include <torch/all.h>
 
-#include "core/scalar_type.hpp"
-
 torch::Tensor marlin_gemm_moe(
     const torch::Tensor& a, const torch::Tensor& b_q_weights,
     const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
     const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
     const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
-    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
-    int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
+    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
     bool replicate_input, bool apply_weights);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index cd65a8ee92b9..8a0e625b43fa 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -13,11 +13,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   m.def(
       "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
       "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
-      "g_idx, Tensor! perm, Tensor! workspace, "
-      "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
-      "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
-      "int moe_block_size, bool replicate_input, bool apply_weights)"
-      " -> Tensor");
+      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
+      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
+      "bool replicate_input, bool apply_weights) -> Tensor");
   m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 #endif
 }
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 7e359ff08088..2250cf1598b8 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -140,7 +140,6 @@ def compute_max_diff(output, output_ref):
 @pytest.mark.parametrize("topk", [2, 6])
 @pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
 @pytest.mark.parametrize("act_order", [True, False])
-@pytest.mark.parametrize("num_bits", [4, 8])
 def test_fused_marlin_moe(
     m: int,
     n: int,
@@ -149,7 +148,6 @@ def test_fused_marlin_moe(
     topk: int,
     group_size: int,
     act_order: bool,
-    num_bits: int,
 ):
     torch.manual_seed(7)
 
@@ -163,8 +161,7 @@ def test_fused_marlin_moe(
         if group_size in (k, n):
             return
 
-    quant_type = (scalar_types.uint4b8
-                  if num_bits == 4 else scalar_types.uint8b128)
+    quant_type = scalar_types.uint4b8
     dtype = torch.float16
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -243,7 +240,6 @@ def test_fused_marlin_moe(
         topk_ids,
         w1_scale=scales1,
         w2_scale=scales2,
-        num_bits=num_bits,
     )
 
     assert compute_max_diff(marlin_output, triton_output) < 4e-2
@@ -258,7 +254,6 @@ def test_fused_marlin_moe(
 @pytest.mark.parametrize("topk", [2, 6])
 @pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
 @pytest.mark.parametrize("act_order", [True, False])
-@pytest.mark.parametrize("num_bits", [4, 8])
 def test_marlin_moe_mmm(
     m: int,
     n: int,
@@ -267,7 +262,6 @@ def test_marlin_moe_mmm(
     topk: int,
     group_size: int,
     act_order: bool,
-    num_bits: int,
 ):
     if topk > e:
         return
@@ -279,8 +273,7 @@ def test_marlin_moe_mmm(
         if group_size == k:
             return
 
-    quant_type = (scalar_types.uint4b8
-                  if num_bits == 4 else scalar_types.uint8b128)
+    quant_type = scalar_types.uint4b8
     dtype = torch.float16
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
@@ -315,8 +308,7 @@ def test_marlin_moe_mmm(
                                       g_idx,
                                       sort_indices,
                                       topk,
-                                      renormalize=False,
-                                      num_bits=num_bits)
+                                      renormalize=False)
     torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
 
     assert compute_max_diff(marlin_output, torch_output) < 1e-2
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 77c46584ef53..151cdbee8eb0 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -308,7 +308,7 @@ def gptq_marlin_moe_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                            num_bits: int) -> torch.Tensor:
     num_experts = b_q_weight.shape[0]
     assert size_k % 16 == 0
-    output = torch.empty((num_experts, size_k // 16, size_n * (num_bits // 2)),
+    output = torch.empty((num_experts, size_k // 16, size_n * 2),
                          device=b_q_weight.device,
                          dtype=b_q_weight.dtype)
     for e in range(num_experts):
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index dea4a32aec4f..e9b5703ca28b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,5 +1,3 @@
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    fused_marlin_moe, single_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -8,16 +6,18 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "fused_marlin_moe",
-    "single_marlin_moe",
 ]
 
 if HAS_TRITON:
+    from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+        fused_marlin_moe, single_marlin_moe)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
 
     __all__ += [
+        "fused_marlin_moe",
+        "single_marlin_moe",
         "fused_moe",
         "fused_topk",
         "fused_experts",
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index c7906205760f..6b01ec0a623a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -7,21 +7,18 @@
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size, try_get_optimal_moe_config)
-from vllm.scalar_type import scalar_types
 
 
 def single_marlin_moe(
-    hidden_states: torch.Tensor,
-    w: torch.Tensor,
-    scales: torch.Tensor,
-    gating_output: torch.Tensor,
-    g_idx: torch.Tensor,
-    perm: torch.Tensor,
-    topk: int,
-    renormalize: bool,
-    override_config: Optional[Dict[str, Any]] = None,
-    num_bits: int = 8,
-) -> torch.Tensor:
+        hidden_states: torch.Tensor,
+        w: torch.Tensor,
+        scales: torch.Tensor,
+        gating_output: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+        override_config: Optional[Dict[str, Any]] = None) -> torch.Tensor:
     """
     This function computes a Marlin MoE MMM using weights w
     and top-k gating mechanism. It is meant for testing and debugging.
@@ -38,7 +35,6 @@ def single_marlin_moe(
     - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
     - override_config (Optional[Dict[str, Any]]): Optional override
         for the kernel configuration.
-    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -50,14 +46,11 @@ def single_marlin_moe(
     assert gating_output.shape[1] == w.shape[0], "Number of experts mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w.is_contiguous(), "Expert weights must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-    assert num_bits in [4, 8]
+    assert hidden_states.dtype == torch.float16
 
     M, K = hidden_states.shape
     E = w.shape[0]
-    N = w.shape[2] // (num_bits // 2)
+    N = w.shape[2] // 2
 
     topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
                                         renormalize)
@@ -82,13 +75,10 @@ def single_marlin_moe(
                             device="cuda",
                             requires_grad=False)
 
-    scalar_type = (scalar_types.uint4b8
-                   if num_bits == 4 else scalar_types.uint8b128)
-
     intermediate_cache = torch.ops._moe_C.marlin_gemm_moe(
         hidden_states, w, sorted_token_ids, topk_weights, topk_ids, scales,
-        g_idx, perm, workspace, scalar_type, M, N, K, True, E, topk,
-        block_size_m, True, False)
+        g_idx, perm, workspace, M, N, K, True, E, topk, block_size_m, True,
+        False)
 
     return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1)
 
@@ -107,7 +97,6 @@ def fused_marlin_moe(
     override_config: Optional[Dict[str, Any]] = None,
     w1_scale: Optional[torch.Tensor] = None,
     w2_scale: Optional[torch.Tensor] = None,
-    num_bits: int = 8,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -132,7 +121,6 @@ def fused_marlin_moe(
         w1.
     - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
         w2.
-    - num_bits (bool): The number of bits in expert weights quantization.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -142,16 +130,13 @@ def fused_marlin_moe(
         0], "Number of tokens mismatch"
     assert hidden_states.shape[
         1] == w1.shape[1] * 16, "Hidden size mismatch w1"
-    assert hidden_states.shape[1] == w2.shape[2] // (
-        num_bits // 2), "Hidden size mismatch w2"
+    assert hidden_states.shape[
+        1] == w2.shape[2] // 2, "Hidden size mismatch w2"
     assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
     assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
-    assert num_bits in [4, 8]
+    assert hidden_states.dtype == torch.float16
 
     M, K = hidden_states.shape
     E = w1.shape[0]
@@ -179,9 +164,6 @@ def fused_marlin_moe(
                             device="cuda",
                             requires_grad=False)
 
-    scalar_type = (scalar_types.uint4b8
-                   if num_bits == 4 else scalar_types.uint8b128)
-
     intermediate_cache2 = torch.empty(
         (M * topk_ids.shape[1], N),
         device=hidden_states.device,
@@ -198,7 +180,6 @@ def fused_marlin_moe(
         g_idx1,
         perm1,
         workspace,
-        scalar_type,
         M,
         2 * N,
         K,
@@ -222,7 +203,6 @@ def fused_marlin_moe(
         g_idx2,
         perm2,
         workspace,
-        scalar_type,
         M,
         K,
         N,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 7dee2fca8115..f8a41dfd08d7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -295,5 +295,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
-            num_bits=self.num_bits,
         )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 7ca8eecb9283..e3b74e871290 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -18,7 +18,6 @@
 __all__ = ["CompressedTensorsWNA16"]
 WNA16_SUPPORTED_TYPES_MAP = {
     4: scalar_types.uint4b8,
-    8: scalar_types.uint8b128,
 }
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index d593298cf2f1..15c0a570c4ca 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -600,5 +600,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
-            num_bits=self.quant_config.quant_type.size_bits,
         )

From c573fa1b084e789dd4821fa020efac84f5574a17 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Thu, 5 Sep 2024 21:07:45 +0000
Subject: [PATCH 39/46] update/fix weight loading to support tp

---
 vllm/model_executor/layers/fused_moe/layer.py | 80 ++++++++++---------
 .../layers/quantization/gptq_marlin.py        | 11 ++-
 2 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index b0d7d4b538df..f4621e5c4ccc 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -306,10 +306,28 @@ def _load_single_value(self, param: torch.nn.Parameter,
         # Input scales can be loaded directly and should be equal.
         param_data[expert_id] = loaded_weight
 
+    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
+                    shard_dim: int, loaded_weight: torch.tensor, tp_rank: int):
+
+        if shard_id == "w2":
+            self._load_w2(shard_id=shard_id,
+                          shard_dim=shard_dim,
+                          loaded_weight=loaded_weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank)
+        else:
+            assert shard_id in ("w1", "w3")
+            expert_data.copy_(loaded_weight)
+
     def weight_loader(self, param: torch.nn.Parameter,
                       loaded_weight: torch.Tensor, weight_name: str,
                       shard_id: str, expert_id: int) -> None:
 
+        # llm-compressor saves weights to disk transposed, so flip them back
+        loaded_weight = loaded_weight.t().contiguous() if (
+            self.quant_method.__class__.__name__
+            == "CompressedTensorsMoEMethod") else loaded_weight
+
         if shard_id not in ("w1", "w2", "w3"):
             raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
                              f"got {shard_id}.")
@@ -325,38 +343,41 @@ def weight_loader(self, param: torch.nn.Parameter,
         expert_data = param.data[expert_id]
         tp_rank = get_tensor_model_parallel_rank()
 
-        # is_transposed: whether or not the parameter is transposed on disk
-        # If transposed, the loaded weight will be transposed and the dim
-        # to shard the loaded weight will be flipped.
+        # is_transposed: if the dim to shard the weight
+        # should be flipped. Required by GPTQ, compressed-tensors
+        # should be whatever dimension intermediate_size is
         is_transposed = getattr(param, "is_transposed", False)
         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
         if is_transposed:
-            loaded_weight = loaded_weight.t().contiguous()
             shard_dim = ~shard_dim
 
-        # GPTQ Values
-        if ("scales" in weight_name or "qweight" in weight_name
-                or "qzeros" in weight_name):
-            if (shard_id == "w1" or shard_id == "w3"):
-                shard_dim = 1 - shard_dim
-            self._load_model_weight_or_group_weight_scale(
-                shard_id=shard_id,
-                shard_dim=shard_dim,
-                loaded_weight=loaded_weight,
-                expert_data=expert_data,
-                tp_rank=tp_rank)
-            return
+        # Case input scale: input_scale loading is only supported for fp8
+        if "input_scale" in weight_name:
+            if param.data[expert_id] != 1 and (param.data[expert_id] -
+                                               loaded_weight).abs() > 1e-5:
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param.data[expert_id]} "
+                    f"vs. {loaded_weight}")
 
-        if "g_idx" in weight_name:
             self._load_single_value(param=param,
                                     loaded_weight=loaded_weight,
                                     expert_id=expert_id)
             return
 
-        # Case weight_scales
-        if "weight_scale" in weight_name:
-            # load the weight scaling based on the quantization scheme
-            # supported weight scales can be found in
+        # Case g_idx
+        if "g_idx" in weight_name:
+            self._load_g_idx(shard_dim=0,
+                             shard_id=shard_id,
+                             loaded_weight=loaded_weight,
+                             expert_data=expert_data,
+                             tp_rank=tp_rank)
+            return
+
+        # Case weight scales and zero_points
+        if ("scale" in weight_name or "zero" in weight_name):
+            # load the weight scales and zp based on the quantization scheme
+            # supported weight scales/zp can be found in
             # FusedMoeWeightScaleSupported
             # TODO @dsikka: once hardened, refactor to use vLLM Parameters
             # specific to each case
@@ -385,22 +406,9 @@ def weight_loader(self, param: torch.nn.Parameter,
                     f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
             return
 
+        # Case weight_shape
         if "weight_shape" in weight_name:
-            self._load_single_value(param=param,
-                                    loaded_weight=loaded_weight,
-                                    expert_id=expert_id)
-            return
-
-        # Case input scale
-        if "input_scale" in weight_name:
-            # Note: input_scale loading is only supported for fp8
-            if param.data[expert_id] != 1 and (param.data[expert_id] -
-                                               loaded_weight).abs() > 1e-5:
-                raise ValueError(
-                    "input_scales of w1 and w3 of a layer "
-                    f"must be equal. But got {param.data[expert_id]} "
-                    f"vs. {loaded_weight}")
-
+            # only required by compressed-tensors
             self._load_single_value(param=param,
                                     loaded_weight=loaded_weight,
                                     expert_id=expert_id)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 15c0a570c4ca..0a470f311c74 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -7,8 +7,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
     fused_marlin_moe)
-from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
-                                                        FusedMoEMethodBase)
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -372,9 +372,16 @@ def create_weights(
         if self.quant_config.group_size != -1:
             scales_size13 = hidden_size // self.quant_config.group_size
             scales_size2 = intermediate_size // self.quant_config.group_size
+            strategy = FusedMoeWeightScaleSupported.GROUP.value
         else:
             scales_size13 = 1
             scales_size2 = 1
+            strategy = FusedMoeWeightScaleSupported.CHANNEL.value
+
+        extra_weight_attrs.update({
+            "quant_method": strategy,
+            "is_transposed": True
+        })
         # Fused gate_up_proj (column parallel)
         w13_qweight = torch.nn.Parameter(
             torch.empty(

From a991d828a6c688eeb3d87db4cf7651510c447e65 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 6 Sep 2024 15:13:54 +0000
Subject: [PATCH 40/46] fix; update large model testing cases

---
 .buildkite/test-pipeline.yaml                       | 13 ++++++++++++-
 tests/weight_loading/models-large.txt               |  3 +++
 tests/weight_loading/models.txt                     |  2 --
 .../compressed_tensors/compressed_tensors_moe.py    |  7 ++-----
 .../schemes/compressed_tensors_wNa16.py             |  1 +
 5 files changed, 18 insertions(+), 8 deletions(-)
 create mode 100644 tests/weight_loading/models-large.txt

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d0317b2fc48c..a0c7b7442b3b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -386,7 +386,18 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-    - bash weight_loading/run_model_weight_loading_test.sh
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt 
 
 
 ##### multi gpus test #####
diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt
new file mode 100644
index 000000000000..fe7670574676
--- /dev/null
+++ b/tests/weight_loading/models-large.txt
@@ -0,0 +1,3 @@
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
\ No newline at end of file
diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt
index 1dc529037a98..a3e382acf56b 100644
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -19,8 +19,6 @@ compressed-tensors, nm-testing/tinyllama-oneshot-w8a16-per-channel, main
 compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
-compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index f8a41dfd08d7..49c29c2775cb 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -6,8 +6,6 @@
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    WNA16_SUPPORTED_BITS)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat)
 from vllm.model_executor.utils import set_weight_attrs
@@ -40,11 +38,10 @@ def __init__(
 
         if not (self.quant_config.quant_format
                 == CompressionFormat.pack_quantized.value
-                and self.num_bits in WNA16_SUPPORTED_BITS):
+                and self.num_bits == 4):
             raise ValueError("For Fused MoE layers, only ",
                              f"{CompressionFormat.pack_quantized.value} ",
-                             "is supported for the following bits: ",
-                             f"{WNA16_SUPPORTED_BITS}")
+                             "is supported for 4 bits")
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size: int,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index e3b74e871290..cae6ffad53df 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -18,6 +18,7 @@
 __all__ = ["CompressedTensorsWNA16"]
 WNA16_SUPPORTED_TYPES_MAP = {
     4: scalar_types.uint4b8,
+    8: scalar_types.uint8b128
 }
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
 

From d57804d96b036a2916cfd872d9e8ea3889442051 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 6 Sep 2024 16:13:55 +0000
Subject: [PATCH 41/46] add hack to support unfused mixtral pathway for int8

---
 vllm/model_executor/model_loader/utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index d247e4cf3f07..0052489d99dc 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -23,11 +23,19 @@ def get_model_architecture(
     architectures = getattr(model_config.hf_config, "architectures", [])
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
-    mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin"]
+    mixtral_supported = ["fp8", "compressed-tensors"]
+    # for gptq_marlin, only run fused MoE for int4
+    if model_config.quantization == "gptq_marlin":
+        hf_quant_config = getattr(model_config.hf_config,
+                                  "quantization_config", None)
+        if hf_quant_config and hf_quant_config.get("bits") == 4:
+            mixtral_supported.append("gptq_marlin")
+
     if (model_config.quantization is not None
             and model_config.quantization not in mixtral_supported
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]
+
     return ModelRegistry.resolve_model_cls(architectures)
 
 

From 96fa486336d28e741c392277d06232ec6a0eed17 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Fri, 6 Sep 2024 18:29:36 +0000
Subject: [PATCH 42/46] fix install for tpu test

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 0a470f311c74..3bc35dca5d03 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -5,8 +5,6 @@
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    fused_marlin_moe)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -583,6 +581,8 @@ def apply(
         topk_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
     ) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            fused_marlin_moe)
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,

From 1faab903a378361275738c04b2dd394067153f20 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Sat, 7 Sep 2024 06:30:32 -0400
Subject: [PATCH 43/46] Move float16 typecast hack to gptq marlin moe method

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 3 +++
 vllm/model_executor/models/mixtral.py                  | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 3bc35dca5d03..a01d5fe65538 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -584,6 +584,9 @@ def apply(
         from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
             fused_marlin_moe)
 
+        # The input must currently be float16
+        x = x.half()
+
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 148ef393277e..df7f39097bdc 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -99,7 +99,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states.half(), router_logits)
+        final_hidden_states = self.experts(hidden_states, router_logits)
         return final_hidden_states.view(orig_shape).to(orig_dtype)
 
 

From 970e06a77a02953e43a59c5683891dcd968f4f14 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Sat, 7 Sep 2024 06:58:44 -0400
Subject: [PATCH 44/46] Move output type conversion to gptq method as well

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++-
 vllm/model_executor/models/mixtral.py                  | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index a01d5fe65538..3617a32f80fc 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -585,6 +585,7 @@ def apply(
             fused_marlin_moe)
 
         # The input must currently be float16
+        orig_dtype = x.dtype
         x = x.half()
 
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -610,4 +611,4 @@ def apply(
             topk_ids,
             w1_scale=layer.w13_scales,
             w2_scale=layer.w2_scales,
-        )
+        ).to(orig_dtype)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index df7f39097bdc..6413b56605ec 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -95,12 +95,11 @@ def __init__(self,
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # NOTE: hidden_states can have either 1D or 2D shape.
         orig_shape = hidden_states.shape
-        orig_dtype = hidden_states.dtype
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, router_logits)
-        return final_hidden_states.view(orig_shape).to(orig_dtype)
+        return final_hidden_states.view(orig_shape)
 
 
 class MixtralAttention(nn.Module):

From fd0a4f2b2f1627330641645e2891ad4655e1f0b5 Mon Sep 17 00:00:00 2001
From: Dipika <dipikasikka1@gmail.com>
Date: Mon, 9 Sep 2024 01:48:38 +0000
Subject: [PATCH 45/46] typo fix; fix comment

---
 vllm/model_executor/layers/fused_moe/fused_marlin_moe.py | 2 +-
 vllm/model_executor/layers/fused_moe/layer.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 6b01ec0a623a..3639350d850e 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -108,7 +108,7 @@ def fused_marlin_moe(
     - w2 (torch.Tensor): The second set of expert weights.
     - gating_output (torch.Tensor): The output of the gating operation
         (before softmax).
-    - g_idx1 (torch.Tensor): The fist set of act_order indices.
+    - g_idx1 (torch.Tensor): The first set of act_order indices.
     - g_idx2 (torch.Tensor): The second set of act_order indices.
     - perm1 (torch.Tensor): The first act_order input permutation.
     - perm2 (torch.Tensor): The second act_order input permutation.
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index f4621e5c4ccc..f6c6f5f52940 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -323,7 +323,7 @@ def weight_loader(self, param: torch.nn.Parameter,
                       loaded_weight: torch.Tensor, weight_name: str,
                       shard_id: str, expert_id: int) -> None:
 
-        # llm-compressor returns weights on disk which are flipped
+        # compressed-tensors stores weights on disk in transposed form
         loaded_weight = loaded_weight.t().contiguous() if (
             self.quant_method.__class__.__name__
             == "CompressedTensorsMoEMethod") else loaded_weight

From d51a2f43b1a63fb592bfea90f224a10727c3dca3 Mon Sep 17 00:00:00 2001
From: ElizaWszola <eliza@neuralmagic.com>
Date: Mon, 9 Sep 2024 06:56:20 -0400
Subject: [PATCH 46/46] Clarify comment, change how we process bias

---
 vllm/model_executor/layers/fused_moe/fused_marlin_moe.py | 5 +++--
 vllm/model_executor/models/mixtral.py                    | 9 ++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 3639350d850e..200a6148978a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -20,8 +20,9 @@ def single_marlin_moe(
         renormalize: bool,
         override_config: Optional[Dict[str, Any]] = None) -> torch.Tensor:
     """
-    This function computes a Marlin MoE MMM using weights w
-    and top-k gating mechanism. It is meant for testing and debugging.
+    This function multiplies hidden_states by the expert weights w used in
+    Marlin MoE, using a top-k gating mechanism.
+    Its purpose is testing and debugging the fused MoE kernel.
 
     Parameters:
     - hidden_states (torch.Tensor): The input tensor to the Marlin Mul.
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 6413b56605ec..10cbfcf6432b 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -435,7 +435,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
-                if name.endswith("bias") and name not in params_dict:
+                if ((name.endswith(".bias") or name.endswith("_bias"))
+                        and name not in params_dict):
                     continue
                 # Skip layers on other devices.
                 if is_pp_missing_parameter(name, self):
@@ -454,7 +455,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):
                         continue
-                    if name.endswith("bias") and name not in params_dict:
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
                         continue
                     param = params_dict[name]
                     weight_loader = param.weight_loader
@@ -466,7 +468,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
-                    if name.endswith("bias") and name not in params_dict:
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
                         continue
                     # Skip layers on other devices.
                     if is_pp_missing_parameter(name, self):