@@ -86,6 +86,9 @@ find_package(Torch REQUIRED)
8686# Supported NVIDIA architectures.
8787# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
8888if (DEFINED CMAKE_CUDA_COMPILER_VERSION AND
89+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
90+ set (CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0" )
91+ elseif (DEFINED CMAKE_CUDA_COMPILER_VERSION AND
8992 CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
9093 set (CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" )
9194else ()
@@ -175,6 +178,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
175178 list (APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS} " )
176179endif ()
177180
#
# Ask nvcc for size-oriented compression when building CUDA code with a
# 13.0 or newer toolkit. Nothing is appended for other GPU languages, or when
# the CUDA compiler version is not known.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(DEFINED CMAKE_CUDA_COMPILER_VERSION)
    # "not less than 13.0" is the same test as "greater than or equal to 13.0".
    if(NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0)
      list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
    endif()
  endif()
endif()
189+
178190#
179191# Set CUDA include flags for CXX compiler.
180192#
@@ -270,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
270282 SET (CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library" )
271283
272284 # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
# Pin exactly one CUTLASS tag. A cache entry keeps the first value it is
# given, so a second definition with a newer tag would never take effect on a
# fresh configure. The tag is also written without surrounding blanks so it
# names the upstream ref exactly.
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
274286
275287 # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
276288 if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -305,7 +317,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
305317 "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
306318 "csrc/quantization/fp4/nvfp4_quant_entry.cu"
307319 "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
308- "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
309320 "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
310321 "csrc/cutlass_extensions/common.cpp"
311322 "csrc/quantization/fp8/per_token_group_quant.cu" )
@@ -440,7 +451,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
440451
441452 # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
442453 # CUDA 12.8 or later
443- cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS} " )
# Choose the SM120 arch spelling by CUDA toolkit version: 13.0 and newer use
# the family-specific "12.0f", older toolkits the arch-specific "12.0a".
# The version variable is referenced by name (if() dereferences it) so an
# unset value cannot collapse the condition into a syntax error, and the
# target list is passed without a trailing blank, which would otherwise
# become part of its last element.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
444459 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
445460 set (SRCS
446461 "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
@@ -470,7 +485,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
470485
471486 # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
472487 # require CUDA 12.8 or later
473- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS} " )
# Intersect the requested CUDA_ARCHS with the arches accepted for the SM100
# scaled_mm sources. CUDA 13.0+ uses the family-specific "f" names; older
# toolkits enumerate the arch-specific "a" variants individually.
# The version variable is referenced by name so if() dereferences it safely
# even when unset, and "${CUDA_ARCHS}" is passed without a trailing blank so
# the last list element is not altered.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
474493 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
475494 set (SRCS
476495 "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@@ -550,7 +569,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
550569
551570 # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
552571 # CUDA 12.8 or later
553- cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS} " )
# Compute FP4_ARCHS for the SM120 NVFP4 kernels: "12.0f" with CUDA 13.0 or
# newer, "12.0a" with older toolkits.
# The compiler version is tested by variable name (no ${} expansion) so an
# empty value cannot break the if() syntax, and the target arch list is
# quoted without a trailing blank that would corrupt its final element.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
554577 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
555578 set (SRCS
556579 "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -569,7 +592,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
569592 endif ()
570593
571594 # FP4 Archs and flags
572- cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS} " )
# Compute FP4_ARCHS from the requested CUDA_ARCHS. CUDA 13.0+ accepts the
# family-specific "f" names; older toolkits list arch-specific "a" names.
# NOTE(review): unlike the neighbouring pre-13.0 lists, this one has no
# "10.3a" entry - confirm that omission is intended.
# The version variable is referenced by name so if() handles an unset value,
# and "${CUDA_ARCHS}" carries no trailing blank into its last element.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
573600 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
574601 set (SRCS
575602 "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -591,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
591618 endif ()
592619
593620 # CUTLASS MLA Archs and flags
594- cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS} " )
# Compute MLA_ARCHS for the CUTLASS MLA sources: family-specific "f" arch
# names on CUDA 13.0+, arch-specific "a" names on older toolkits.
# The version check names the variable instead of expanding it, so an unset
# value cannot turn the condition into a syntax error; the target list is
# passed without a trailing blank so its last element stays intact.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
595626 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
596627 set (SRCS
597628 "csrc/attention/mla/sm100_cutlass_mla_kernel.cu" )
@@ -635,7 +666,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
635666 endif ()
636667 endif ()
637668
638- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS} " )
# Recompute SCALED_MM_ARCHS for the check that follows: only the 10.0 arch is
# accepted here, spelled "10.0f" on CUDA 13.0+ and "10.0a" on older toolkits.
# The version variable is referenced by name so if() dereferences it safely,
# and "${CUDA_ARCHS}" is quoted without a trailing blank that would become
# part of the last arch entry.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
639674 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
640675 set (SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu" )
641676 set_gencode_flags_for_srcs(
@@ -656,7 +691,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
656691 endif ()
657692
658693 # moe_data.cu is used by all CUTLASS MoE kernels.
659- cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS} " )
# Compute CUTLASS_MOE_DATA_ARCHS. "9.0a" is accepted with every toolkit; the
# newer arches are spelled with the family-specific "f" suffix on CUDA 13.0+
# and enumerated with the arch-specific "a" suffix otherwise.
# The version check names the variable rather than expanding it, so an unset
# value cannot break the if() syntax; the target list is passed without a
# trailing blank so its final element is unchanged.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
660699 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
661700 set (SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu" )
662701 set_gencode_flags_for_srcs(
@@ -675,7 +714,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
675714 endif ()
676715 endif ()
677716
678- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS} " )
# Recompute SCALED_MM_ARCHS for the check that follows: family-specific "f"
# arch names on CUDA 13.0+, arch-specific "a" names on older toolkits.
# The version variable is referenced by name so if() dereferences it safely
# even when unset, and "${CUDA_ARCHS}" is passed without a trailing blank
# that would otherwise be glued onto the last arch entry.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
679722 if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
680723 set (SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu" )
681724 set_gencode_flags_for_srcs(
0 commit comments