@@ -143,6 +143,19 @@ else()
143143 message (FATAL_ERROR "Can't find CUDA or HIP installation." )
144144endif ()
145145
146+
#
# For CUDA we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
  # NOTE(review): clear_cuda_arches and extract_unique_cuda_archs_ascending are
  # project helpers defined outside this view; presumably the first strips the
  # arch flags from CMAKE_CUDA_FLAGS into CUDA_ARCH_FLAGS and the second turns
  # them into a sorted, de-duplicated arch list -- confirm against the helpers.
  clear_cuda_arches(CUDA_ARCH_FLAGS)
  # The argument is quoted so the whole flag list is passed as ONE argument;
  # there must be no stray whitespace inside the quotes or the last element
  # would carry it.
  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
endif()
158+
146159#
147160# Override the GPU architectures detected by cmake/torch and filter them by
148161# the supported versions for the current language.
@@ -223,78 +236,162 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
223236 "csrc/mamba/causal_conv1d/causal_conv1d.cu"
224237 "csrc/quantization/aqlm/gemm_kernels.cu"
225238 "csrc/quantization/awq/gemm_kernels.cu"
226- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
227- "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
228- "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
229- "csrc/quantization/gptq_marlin/gptq_marlin.cu"
230- "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
231- "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
232239 "csrc/quantization/gguf/gguf_kernel.cu"
233- "csrc/quantization/fp8/fp8_marlin.cu"
234240 "csrc/custom_all_reduce.cu"
235241 "csrc/permute_cols.cu"
236- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
237- "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
238- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" )
242+ "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" )
243+
# Apply per-file gencode flags to the common extension sources for the full
# set of requested CUDA architectures (helper defined outside this view).
# Both values are quoted so each list travels as a single keyword argument.
set_gencode_flags_for_srcs(
  SRCS "${VLLM_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")
247+
# Only build Marlin kernels if we are building for at least some compatible
# archs. Keep building Marlin for 9.0 as there are some group sizes and shapes
# that are not supported by Machete yet.
#
# CUDA_ARCHS is quoted so the complete target-arch list is passed as a single
# argument, matching every other cuda_archs_loose_intersection call in this
# file; unquoted, a multi-arch list is split into separate arguments.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
if(MARLIN_ARCHS)
  set(MARLIN_SRCS
    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
  # Compile the Marlin sources only for the archs selected above.
  set_gencode_flags_for_srcs(
    SRCS "${MARLIN_SRCS}"
    CUDA_ARCHS "${MARLIN_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
  message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
  # message() concatenates its arguments without a separator, so the second
  # fragment must start with a space.
  message(STATUS "Not building Marlin kernels as no compatible archs found"
                 " in CUDA target architectures")
endif()
270+
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
#
# On exit SCALED_MM_3X_ARCHS is non-empty only if the c3x kernels are actually
# being built; the scaled_mm_c2x logic reads it to decide which archs remain.
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL 12.0
   AND SCALED_MM_3X_ARCHS)
  set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
  set_gencode_flags_for_srcs(
    SRCS "${SRCS}"
    CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${SRCS}")
  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
  message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
  # Report the reason BEFORE clearing the variable: if compatible archs were
  # found, the only way to reach this branch is a too-old CUDA compiler.
  if(SCALED_MM_3X_ARCHS)
    message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
                   "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                   "later if you intend on running FP8 quantized models on "
                   "Hopper.")
  else()
    message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
                   "in CUDA target architectures")
  endif()

  # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
  # build any 3x kernels
  set(SCALED_MM_3X_ARCHS)
endif()
239298
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
#
# Reads SCALED_MM_3X_ARCHS, which is expected to be empty when the c3x kernels
# are not being built.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
  "7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x; skipped when either
# list is empty so list(REMOVE_ITEM) is never invoked without values or on a
# missing list
if(SCALED_MM_2X_ARCHS AND SCALED_MM_3X_ARCHS)
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
endif()
if(SCALED_MM_2X_ARCHS)
  set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
  set_gencode_flags_for_srcs(
    SRCS "${SRCS}"
    CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
  list(APPEND VLLM_EXT_SRC "${SRCS}")
  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
  message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
else()
  if(SCALED_MM_3X_ARCHS)
    message(STATUS "Not building scaled_mm_c2x as all archs are already built"
                   " for and covered by scaled_mm_c3x")
  else()
    message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
                   "in CUDA target architectures")
  endif()
endif()
251323
252324
#
# Machete kernels

# The machete kernels only work on hopper and require CUDA 12.0 or later.
# Only build Machete kernels if we are building for something compatible with
# sm90a.
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL 12.0
   AND MACHETE_ARCHS)
  #
  # For the Machete kernels we automatically generate sources for various
  # preselected input type pairs and schedules.
  # Generate sources:
  set(MACHETE_GEN_SCRIPT
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
  file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

  message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
  message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")

  # Re-run the generator only when the script changed since the last successful
  # run; the hash of that run is kept in the CMake cache. Both operands are
  # quoted so an empty cache entry still yields a well-formed comparison.
  if(NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
     OR NOT "$CACHE{MACHETE_GEN_SCRIPT_HASH}" STREQUAL "${MACHETE_GEN_SCRIPT_HASH}")
    # execute_process runs the command without a shell, so the caller's
    # PYTHONPATH has to be spliced in with $ENV{...}; a bare $PYTHONPATH would
    # be passed through literally. stdout and stderr both go to the same log.
    execute_process(
      COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
      RESULT_VARIABLE machete_generation_result
      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
    )

    if(NOT machete_generation_result EQUAL 0)
      message(FATAL_ERROR "Machete generation failed."
                          " Result: \"${machete_generation_result}\""
                          "\nCheck the log for details: "
                          "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
    else()
      # FORCE is required here: this cache entry is internal bookkeeping that
      # must be overwritten after every successful generation.
      set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
          CACHE STRING "Last run machete generate script hash" FORCE)
      message(STATUS "Machete generation completed successfully.")
    endif()
  else()
    message(STATUS "Machete generation script has not changed, skipping generation.")
  endif()

  # Add machete generated sources
  file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
  list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})

  # Compile the generated sources only for the Machete-compatible archs.
  set_gencode_flags_for_srcs(
    SRCS "${MACHETE_GEN_SOURCES}"
    CUDA_ARCHS "${MACHETE_ARCHS}")

  # The pytorch binding is only added when the kernels themselves are built.
  list(APPEND VLLM_EXT_SRC
    csrc/quantization/machete/machete_pytorch.cu)

  message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
  # If compatible archs were found, the only way to reach this branch is a
  # too-old CUDA compiler.
  if(MACHETE_ARCHS)
    message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                   "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                   "later if you intend on running w4a16 quantized models on "
                   "Hopper.")
  else()
    message(STATUS "Not building Machete kernels as no compatible archs "
                   "found in CUDA target architectures")
  endif()
endif()
394+ # if CUDA endif
298395endif ()
299396
300397message (STATUS "Enabling C extension." )
@@ -323,14 +420,31 @@ set(VLLM_MOE_EXT_SRC
323420 "csrc/moe/torch_bindings.cpp"
324421 "csrc/moe/topk_softmax_kernels.cu" )
325422
if(VLLM_GPU_LANG STREQUAL "CUDA")
  # CUDA_ARCHS is only populated for CUDA builds, so the per-file gencode
  # flags for the base MoE sources are applied inside the CUDA guard.
  set_gencode_flags_for_srcs(
    SRCS "${VLLM_MOE_EXT_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")

  # Only build the Marlin MoE kernels when at least one compatible arch is
  # being targeted.
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  if(MARLIN_MOE_ARCHS)
    set(MARLIN_MOE_SRC
      "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
      "csrc/moe/marlin_moe_ops.cu")

    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    # message() concatenates its arguments without a separator, so the second
    # fragment must start with a space.
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
endif()
335449
336450message (STATUS "Enabling moe extension." )
0 commit comments