@@ -938,6 +938,124 @@ __global__ void indexer_k_cache_kernel(
     kv_cache[dst_offset + i] = k_val_ptr[i];
   }
 }
+
+template <int BLOCK_Y_SIZE>
+__global__ void cp_gather_indexer_k_quant_cache_kernel(
+    const char* __restrict__ kv_cache,  // [num_blocks, block_size,
+                                        // cache_stride]
+    char* __restrict__ dst_k,      // [num_tokens, head_dim]
+    char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size *
+                                   // 4]
+    const int* __restrict__ block_table,  // [batch_size, num_blocks]
+    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
+    const int batch_size,                 // batch size
+    const int64_t token_stride,           // stride for each token in dst_k
+    const int64_t head_dim,               // dimension of each head
+    const int64_t block_stride,           // stride for each block in kv_cache
+    const int64_t cache_token_stride,  // stride for each token in kv_cache
+    const int64_t cache_block_size,    // num_tokens for each block in kv_cache
+    const int num_blocks,              // number of blocks
+    const int num_tokens,              // number of tokens
+    const int quant_block_size         // quantization block size
+) {
+  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
+  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
+  // Find batch index within a block
+  __shared__ int batch_idx[BLOCK_Y_SIZE];
+  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
+       iter++) {
+    int tid = iter * blockDim.x + threadIdx.x;
+    if (tid < batch_size) {
+      const int seq_start = cu_seq_lens[tid];
+      const int seq_end = cu_seq_lens[tid + 1];
+      if (token_idx >= seq_start && token_idx < seq_end) {
+        batch_idx[threadIdx.y] = tid;
+      }
+    }
+  }
+
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+
+  if (head_idx >= head_dim || token_idx >= num_tokens) {
+    return;
+  }
+  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
+  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
+                                    inbatch_seq_idx / cache_block_size];
+  const int64_t src_block_offset = block_idx * block_stride;
+  const int64_t cache_inblock_offset =
+      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
+  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
+  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
+
+  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
+      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
+
+  if (threadIdx.x == 0) {
+    const int64_t src_scale_offset =
+        src_block_offset + cache_block_size * head_dim +
+        cache_inblock_offset * 4 / quant_block_size;
+    reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
+        reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
+  }
+}
+
+template <int BLOCK_Y_SIZE>
+__global__ void cp_gather_indexer_k_cache_kernel(
+    const char* __restrict__ kv_cache,  // [num_blocks, block_size,
+                                        // cache_stride]
+    char* __restrict__ dst_k,             // [num_tokens, head_dim]
+    const int* __restrict__ block_table,  // [batch_size, num_blocks]
+    const int* __restrict__ cu_seq_lens,  // [batch_size + 1]
+    const int batch_size,                 // batch size
+    const int64_t token_stride,           // stride for each token in dst_k
+    const int64_t head_dim,               // dimension of each head
+    const int64_t block_stride,           // stride for each block in kv_cache
+    const int64_t cache_token_stride,  // stride for each token in kv_cache
+    const int64_t cache_block_size,    // num_tokens for each block in kv_cache
+    const int num_blocks,              // number of blocks
+    const int num_tokens               // number of tokens
+) {
+  constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
+  const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
+  // Find batch index within a block
+  __shared__ int batch_idx[BLOCK_Y_SIZE];
+  for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
+       iter++) {
+    int tid = iter * blockDim.x + threadIdx.x;
+    if (tid < batch_size) {
+      const int seq_start = cu_seq_lens[tid];
+      const int seq_end = cu_seq_lens[tid + 1];
+      if (token_idx >= seq_start && token_idx < seq_end) {
+        batch_idx[threadIdx.y] = tid;
+      }
+    }
+  }
+
+#ifndef USE_ROCM
+  __syncwarp();
+#endif
+
+  if (head_idx >= head_dim || token_idx >= num_tokens) {
+    return;
+  }
+  const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
+  const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
+                                    inbatch_seq_idx / cache_block_size];
+  const int64_t src_block_offset = block_idx * block_stride;
+  const int64_t cache_inblock_offset =
+      (inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
+  const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
+  const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
+
+  reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
+      reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
+}
+
 }  // namespace vllm
 
 // Macro to dispatch the kernel based on the data type.
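For reference, both gather kernels implement the same mapping: a destination token is assigned to its sequence via cu_seq_lens, its logical block index is translated to a physical block through block_table, and head_dim bytes of K are copied out of that block; the quantized variant additionally copies one float scale per quant_block_size elements from the scale region stored after the cache_block_size * head_dim K bytes of each block. The sketch below is a sequential CPU reference of that mapping for illustration only; the name cp_gather_indexer_k_cache_ref and the plain-pointer interface are assumptions, not part of this diff.

#include <cstdint>
#include <cstring>

// Illustrative CPU reference of the gather done by
// cp_gather_indexer_k_cache_kernel (sketch only).
void cp_gather_indexer_k_cache_ref(
    const char* kv_cache, char* dst_k, const int* block_table,
    const int* cu_seq_lens, int batch_size, int64_t token_stride,
    int64_t head_dim, int64_t block_stride, int64_t cache_block_size,
    int num_blocks, int num_tokens) {
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    // Locate the sequence that owns this token via cumulative lengths.
    int b = 0;
    while (b < batch_size && token_idx >= cu_seq_lens[b + 1]) ++b;
    const int inbatch_seq_idx = token_idx - cu_seq_lens[b];
    // Translate the logical block index to a physical cache block.
    const int block_idx =
        block_table[b * num_blocks + inbatch_seq_idx / cache_block_size];
    const char* src = kv_cache + block_idx * block_stride +
                      (inbatch_seq_idx % cache_block_size) * head_dim;
    // Copy head_dim bytes of K for this token.
    std::memcpy(dst_k + token_idx * token_stride, src, head_dim);
  }
}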
@@ -1083,4 +1201,114 @@ void indexer_k_cache(
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "auto", CALL_INDEXER_K_CACHE);
+}
+
+// Macro to dispatch the kernel based on the data amount.
+#define CALL_CP_GATHER_INDEXER_K_CACHE(BLOCK_Y_SIZE)                        \
+  vllm::cp_gather_indexer_k_cache_kernel<BLOCK_Y_SIZE>                      \
+      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,               \
+              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),              \
+         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                               \
+          reinterpret_cast<char*>(kv_cache.data_ptr()),                     \
+          reinterpret_cast<char*>(dst_k.data_ptr()),                        \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
+          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),   \
+          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),        \
+          num_tokens);
+
+void cp_gather_indexer_k_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    const torch::Tensor& block_table,  // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
+) {
+  int batch_size = block_table.size(0);
+  int num_tokens = dst_k.size(0);
+  int head_dim = dst_k.size(1);
+  // int quant_block_size = head_dim * 4 / dst_scale.size(1);
+
+  TORCH_CHECK(kv_cache.device() == dst_k.device(),
+              "kv_cache and dst_k must be on the same device");
+  // TORCH_CHECK(kv_cache.device() == dst_scale.device(),
+  //             "kv_cache and dst_scale must be on the same device");
+  TORCH_CHECK(kv_cache.device() == block_table.device(),
+              "kv_cache and block_table must be on the same device");
+  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
+              "kv_cache and cu_seq_lens must be on the same device");
+  // TORCH_CHECK(head_dim % quant_block_size == 0,
+  //             "head_dim must be divisible by quant_block_size");
+
+  constexpr int vec_size = 16;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (num_tokens < 32) {
+    CALL_CP_GATHER_INDEXER_K_CACHE(1);
+  } else if (num_tokens < 64) {
+    CALL_CP_GATHER_INDEXER_K_CACHE(2);
+  } else if (num_tokens < 128) {
+    CALL_CP_GATHER_INDEXER_K_CACHE(4);
+  } else if (num_tokens < 256) {
+    CALL_CP_GATHER_INDEXER_K_CACHE(8);
+  } else if (num_tokens < 512) {
+    CALL_CP_GATHER_INDEXER_K_CACHE(16);
+  } else {
+    CALL_CP_GATHER_INDEXER_K_CACHE(32);
+  }
+}
+
+// Macro to dispatch the kernel based on the data amount.
+#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                  \
+  vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                \
+      <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,               \
+              (head_dim + 8 * vec_size - 1) / (8 * vec_size)),              \
+         dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                               \
+          reinterpret_cast<char*>(kv_cache.data_ptr()),                     \
+          reinterpret_cast<char*>(dst_k.data_ptr()),                        \
+          reinterpret_cast<char*>(dst_scale.data_ptr()),                    \
+          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
+          batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),   \
+          kv_cache.stride(1), kv_cache.size(1), block_table.size(1),        \
+          num_tokens, quant_block_size);
+
+void cp_gather_indexer_k_quant_cache(
+    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
+    torch::Tensor& dst_k,           // [num_tokens, head_dim]
+    torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4]
+    const torch::Tensor& block_table,  // [batch_size, num_blocks]
+    const torch::Tensor& cu_seq_lens   // [batch_size + 1]
+) {
+  int batch_size = block_table.size(0);
+  int num_tokens = dst_k.size(0);
+  int head_dim = dst_k.size(1);
+  int quant_block_size = head_dim * 4 / dst_scale.size(1);
+
+  TORCH_CHECK(kv_cache.device() == dst_k.device(),
+              "kv_cache and dst_k must be on the same device");
+  TORCH_CHECK(kv_cache.device() == dst_scale.device(),
+              "kv_cache and dst_scale must be on the same device");
+  TORCH_CHECK(kv_cache.device() == block_table.device(),
+              "kv_cache and block_table must be on the same device");
+  TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
+              "kv_cache and cu_seq_lens must be on the same device");
+  TORCH_CHECK(head_dim % quant_block_size == 0,
+              "head_dim must be divisible by quant_block_size");
+
+  constexpr int vec_size = 16;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (num_tokens < 32) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
+  } else if (num_tokens < 64) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
+  } else if (num_tokens < 128) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
+  } else if (num_tokens < 256) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
+  } else if (num_tokens < 512) {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
+  } else {
+    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
+  }
+}
 }
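A hedged usage sketch of the new quantized host entry point follows. The forward declaration mirrors the wrapper signature in this diff (normally it would come from the project's cache-ops header); the block counts, head size, batch split, and the FP8-byte-plus-float-scale layout constants are illustrative assumptions, not values taken from the source.

#include <torch/torch.h>

// Declaration mirroring the wrapper added above (sketch only).
void cp_gather_indexer_k_quant_cache(const torch::Tensor& kv_cache,
                                     torch::Tensor& dst_k,
                                     torch::Tensor& dst_scale,
                                     const torch::Tensor& block_table,
                                     const torch::Tensor& cu_seq_lens);

int main() {
  // Illustrative sizes: 8 physical blocks of 64 tokens, 128-dim K vectors,
  // one 4-byte float scale per 128 quantized elements.
  const int64_t num_blocks = 8, block_size = 64, head_dim = 128;
  const int64_t quant_block_size = 128, batch_size = 2, num_tokens = 100;
  const int64_t cache_stride = head_dim + head_dim / quant_block_size * 4;

  auto byte_opts =
      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCUDA);
  auto int_opts =
      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);

  torch::Tensor kv_cache =
      torch::zeros({num_blocks, block_size, cache_stride}, byte_opts);
  // Two sequences of 70 and 30 tokens; each row of block_table lists the
  // physical block ids backing one sequence.
  torch::Tensor cu_seq_lens = torch::tensor({0, 70, 100}, int_opts);
  torch::Tensor block_table =
      torch::arange(batch_size * (num_blocks / batch_size), int_opts)
          .reshape({batch_size, num_blocks / batch_size});
  torch::Tensor dst_k = torch::empty({num_tokens, head_dim}, byte_opts);
  torch::Tensor dst_scale =
      torch::empty({num_tokens, head_dim / quant_block_size * 4}, byte_opts);

  // Gathers the quantized K bytes and their scales for all 100 tokens.
  cp_gather_indexer_k_quant_cache(kv_cache, dst_k, dst_scale, block_table,
                                  cu_seq_lens);
  return 0;
}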