From a887eb6496f39437db7e4534ad24299b1b018816 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 10 Mar 2025 16:27:44 +0000 Subject: [PATCH 1/3] Build sparse kernels on hopper Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5349b64aecb6..2d443068e4df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -423,6 +423,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor # require CUDA 12.2 or later (and only work on Hopper and Blackwell). + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( From 9f5f74336dbd85aa1a0d21c3e1d86b1abdbf4b2b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 10 Mar 2025 16:35:05 +0000 Subject: [PATCH 2/3] update comment Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d443068e4df..3a32df1835e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper and Blackwell). + # require CUDA 12.2 or later (and only work on Hopper). cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") From c1d88d9d83e12e6213300ce6bbe905711aed1886 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 10 Mar 2025 16:42:09 +0000 Subject: [PATCH 3/3] restrict to hopper Signed-off-by: Lucas Wilkinson --- csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index 8c408719e8ee..38b929be41c1 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -58,7 +58,9 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, // Guard against compilation issues for sm90 kernels #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X - if (version_num >= 90) { + // We build for 9.0a which is not forward compatible, so restrict this to + // Hopper only + if (version_num == 90) { cutlass_scaled_sparse_mm_sm90(c, a, bt_nzs, bt_meta, a_scales, b_scales, bias); return; @@ -82,7 +84,9 @@ std::vector cutlass_sparse_compress(torch::Tensor const& a) { // Guard against compilation issues for sm90 kernels #if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X - if (version_num >= 90) { + // We build for 9.0a which is not forward compatible, so restrict this to + // Hopper only + if (version_num == 90) { std::vector result_tensors; auto [a_meta, a_nzs] = cutlass_sparse_compress_sm90(a);