-
-
Notifications
You must be signed in to change notification settings - Fork 12k
[NVIDIA] Blackwell Family #24673
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
[NVIDIA] Blackwell Family #24673
Changes from all commits
Commits
Show all changes
34 commits
Select commit
Hold shift + click to select a range
90df7a0
Update CMakeLists.txt
johnnynunez 7e0ad57
Merge branch 'vllm-project:main' into main
johnnynunez 8b47abf
Merge branch 'vllm-project:main' into main
johnnynunez 32799f5
Merge branch 'vllm-project:main' into main
johnnynunez 985939d
Merge branch 'vllm-project:main' into main
johnnynunez 2abe3d8
Merge branch 'vllm-project:main' into main
johnnynunez 8873abf
Update CMakeLists.txt
johnnynunez bf3e85f
Update CMakeLists.txt
johnnynunez 04c0a30
cuda <= 12.8
johnnynunez f045cca
cutlass 4.2.0
johnnynunez cc0bcc7
Update CMakeLists.txt
johnnynunez d042189
Dropping in the exact, minimal changes you need so family-conditional …
johnnynunez ccee6ea
Update CMakeLists.txt
johnnynunez 40a67d2
Update utils.cmake
johnnynunez 1f770bc
Update comments
johnnynunez 66168cf
Update CMakeLists.txt
johnnynunez 78dd97b
Update CMakeLists.txt
johnnynunez 76a983d
Update CMakeLists.txt
johnnynunez 1e4b5b7
fix cmake
johnnynunez eae5c72
remove duplicate kernel
johnnynunez ebfebee
Update scaled_mm_sm100_fp8_dispatch.cuh
johnnynunez b8ba625
revert, waiting for fix
johnnynunez c7891a9
upgrade cutlass version
johnnynunez 4481acb
fix kernel
johnnynunez 124052c
Merge branch 'main' into patch-1
johnnynunez 2bda7a5
fix
johnnynunez f237098
add compression mode for cuda >=13
johnnynunez fa6a04c
Merge branch 'vllm-project:main' into patch-1
johnnynunez a97af43
Update scaled_mm_entry.cu
DrStone1971 1ac2fc5
Merge pull request #2 from DrStone71/patch-2
johnnynunez 88d88d1
precommit
johnnynunez ac301ea
Merge branch 'vllm-project:main' into patch-1
johnnynunez 82ee77a
fix correct support
johnnynunez 56b0ff7
Merge branch 'vllm-project:main' into patch-1
johnnynunez File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,6 +86,9 @@ find_package(Torch REQUIRED) | |
| # Supported NVIDIA architectures. | ||
| # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined | ||
| if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND | ||
| CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) | ||
| set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0") | ||
| elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND | ||
| CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) | ||
| set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") | ||
| else() | ||
|
|
@@ -175,6 +178,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") | |
| list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") | ||
| endif() | ||
|
|
||
| # | ||
| # Set compression mode for CUDA >=13.x. | ||
| # | ||
| if(VLLM_GPU_LANG STREQUAL "CUDA" AND | ||
| DEFINED CMAKE_CUDA_COMPILER_VERSION AND | ||
| CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) | ||
| list(APPEND VLLM_GPU_FLAGS "--compress-mode=size") | ||
| endif() | ||
|
|
||
| # | ||
| # Set CUDA include flags for CXX compiler. | ||
| # | ||
|
|
@@ -270,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") | ||
|
|
||
| # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. | ||
| set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") | ||
| set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") | ||
|
|
||
| # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided | ||
| if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) | ||
|
|
@@ -305,7 +317,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" | ||
| "csrc/quantization/fp4/nvfp4_quant_entry.cu" | ||
| "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" | ||
| "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" | ||
| "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" | ||
| "csrc/cutlass_extensions/common.cpp" | ||
| "csrc/quantization/fp8/per_token_group_quant.cu") | ||
|
|
@@ -440,7 +451,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
|
|
||
| # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require | ||
| # CUDA 12.8 or later | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||
| set(SRCS | ||
| "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" | ||
|
|
@@ -470,7 +485,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
|
|
||
| # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) | ||
| # require CUDA 12.8 or later | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||
| set(SRCS | ||
| "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" | ||
|
|
@@ -550,7 +569,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
|
|
||
| # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require | ||
| # CUDA 12.8 or later | ||
| cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) | ||
| set(SRCS | ||
| "csrc/quantization/fp4/nvfp4_quant_kernels.cu" | ||
|
|
@@ -569,7 +592,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| endif() | ||
|
|
||
| # FP4 Archs and flags | ||
| cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) | ||
| set(SRCS | ||
| "csrc/quantization/fp4/nvfp4_quant_kernels.cu" | ||
|
|
@@ -591,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| endif() | ||
|
|
||
| # CUTLASS MLA Archs and flags | ||
| cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) | ||
| set(SRCS | ||
| "csrc/attention/mla/sm100_cutlass_mla_kernel.cu") | ||
|
|
@@ -635,7 +666,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| endif() | ||
| endif() | ||
|
|
||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||
| set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu") | ||
| set_gencode_flags_for_srcs( | ||
|
|
@@ -656,7 +691,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| endif() | ||
|
|
||
| # moe_data.cu is used by all CUTLASS MoE kernels. | ||
| cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is it 13.0 rather than 12.9?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) | ||
| set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") | ||
| set_gencode_flags_for_srcs( | ||
|
|
@@ -675,7 +714,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | |
| endif() | ||
| endif() | ||
|
|
||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||
| else() | ||
| cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||
| endif() | ||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||
| set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") | ||
| set_gencode_flags_for_srcs( | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out, | |
| } | ||
| } | ||
|
|
||
| } // namespace vllm | ||
| } // namespace vllm | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please clarify where this is duplicated?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am also curious about this