Skip to content

Commit c7f3d7c

Browse files
KyleFromNVIDIAjrbourbeau
authored andcommitted
Always build with JIT+LTO (rapidsai#1923)
Since rapidsai#1909, we've been able to use older versions of the CUDA driver, since we no longer rely on `cudaLibraryEnumerateKernels()`. Since rapidsai#1918, we've been using static cudart, which allows us to run on platforms with versions of CUDA older than 12.8 installed, since the runtime library API is now bundled with cuvs. Always build with JIT+LTO so that we can get the full compile time and binary size benefits in CUDA 12 too. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) Approvers: - Divye Gala (https://github.com/divyegala) - Ben Frederickson (https://github.com/benfred) - Bradley Dice (https://github.com/bdice) URL: rapidsai#1923
1 parent 8d5f3e3 commit c7f3d7c

13 files changed

Lines changed: 102 additions & 1472 deletions

conda/environments/all_cuda-129_arch-aarch64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ dependencies:
3030
- libcurand-dev
3131
- libcusolver-dev
3232
- libcusparse-dev
33+
- libnvjitlink-dev
3334
- librmm==26.6.*,>=0.0.0a0
3435
- make
3536
- nccl>=2.19

conda/environments/all_cuda-129_arch-x86_64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ dependencies:
3030
- libcurand-dev
3131
- libcusolver-dev
3232
- libcusparse-dev
33+
- libnvjitlink-dev
3334
- librmm==26.6.*,>=0.0.0a0
3435
- make
3536
- nccl>=2.19

conda/environments/bench_ann_cuda-129_arch-aarch64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ dependencies:
2929
- libcusolver-dev
3030
- libcusparse-dev
3131
- libcuvs==26.6.*,>=0.0.0a0
32+
- libnvjitlink-dev
3233
- librmm==26.6.*,>=0.0.0a0
3334
- matplotlib-base>=3.9
3435
- nccl>=2.19

conda/environments/bench_ann_cuda-129_arch-x86_64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies:
3131
- libcusolver-dev
3232
- libcusparse-dev
3333
- libcuvs==26.6.*,>=0.0.0a0
34+
- libnvjitlink-dev
3435
- librmm==26.6.*,>=0.0.0a0
3536
- matplotlib-base>=3.9
3637
- mkl-devel=2023

conda/environments/go_cuda-129_arch-aarch64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dependencies:
2525
- libcusolver-dev
2626
- libcusparse-dev
2727
- libcuvs==26.6.*,>=0.0.0a0
28+
- libnvjitlink-dev
2829
- libraft==26.6.*,>=0.0.0a0
2930
- nccl>=2.19
3031
- ninja

conda/environments/go_cuda-129_arch-x86_64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dependencies:
2525
- libcusolver-dev
2626
- libcusparse-dev
2727
- libcuvs==26.6.*,>=0.0.0a0
28+
- libnvjitlink-dev
2829
- libraft==26.6.*,>=0.0.0a0
2930
- nccl>=2.19
3031
- ninja

conda/environments/rust_cuda-129_arch-aarch64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- libcusolver-dev
2323
- libcusparse-dev
2424
- libcuvs==26.6.*,>=0.0.0a0
25+
- libnvjitlink-dev
2526
- libraft==26.6.*,>=0.0.0a0
2627
- make
2728
- nccl>=2.19

conda/environments/rust_cuda-129_arch-x86_64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies:
2222
- libcusolver-dev
2323
- libcusparse-dev
2424
- libcuvs==26.6.*,>=0.0.0a0
25+
- libnvjitlink-dev
2526
- libraft==26.6.*,>=0.0.0a0
2627
- make
2728
- nccl>=2.19

conda/recipes/libcuvs/recipe.yaml

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,7 @@ cache:
7272
- ninja
7373
- ${{ stdlib("c") }}
7474
host:
75-
- if: cuda_major == "13"
76-
then:
77-
- libnvjitlink-dev
75+
- libnvjitlink-dev
7876
- librmm =${{ minor_version }}
7977
- libraft-headers =${{ minor_version }}
8078
- nccl ${{ nccl_version }}
@@ -121,9 +119,7 @@ outputs:
121119
- libcurand-dev
122120
- libcusolver-dev
123121
- libcusparse-dev
124-
- if: cuda_major == "13"
125-
then:
126-
- libnvjitlink-dev
122+
- libnvjitlink-dev
127123
run:
128124
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
129125
- libraft-headers =${{ minor_version }}
@@ -182,9 +178,7 @@ outputs:
182178
- libcurand-dev
183179
- libcusolver-dev
184180
- libcusparse-dev
185-
- if: cuda_major == "13"
186-
then:
187-
- libnvjitlink-dev
181+
- libnvjitlink-dev
188182
run:
189183
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
190184
- ${{ pin_subpackage("libcuvs-headers", exact=True) }}
@@ -242,9 +236,7 @@ outputs:
242236
- libcurand-dev
243237
- libcusolver-dev
244238
- libcusparse-dev
245-
- if: cuda_major == "13"
246-
then:
247-
- libnvjitlink-dev
239+
- libnvjitlink-dev
248240
run:
249241
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
250242
- ${{ pin_subpackage("libcuvs-headers", exact=True) }}
@@ -401,9 +393,7 @@ outputs:
401393
- libcurand-dev
402394
- libcusolver-dev
403395
- libcusparse-dev
404-
- if: cuda_major == "13"
405-
then:
406-
- libnvjitlink-dev
396+
- libnvjitlink-dev
407397
run:
408398
- ${{ pin_subpackage("libcuvs-headers", exact=True) }}
409399
- ${{ pin_subpackage("libcuvs", exact=True) }}

cpp/CMakeLists.txt

Lines changed: 88 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -355,97 +355,90 @@ if(NOT BUILD_CPU_ONLY)
355355
)
356356
endif()
357357

358-
set(JIT_LTO_TARGET_ARCHITECTURE "")
359-
set(JIT_LTO_COMPILATION OFF)
360-
set(jit_lto_files)
358+
set(JIT_LTO_TARGET_ARCHITECTURE "70-real")
361359
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
362360
set(JIT_LTO_TARGET_ARCHITECTURE "75-real")
363-
set(JIT_LTO_COMPILATION ON)
364361
endif()
365362

366-
if(JIT_LTO_COMPILATION)
367-
# Generate interleaved scan kernel files at build time
368-
include(cmake/modules/generate_jit_lto_kernels.cmake)
363+
# Generate interleaved scan kernel files at build time
364+
include(cmake/modules/generate_jit_lto_kernels.cmake)
369365

370-
add_library(jit_lto_kernel_usage_requirements INTERFACE)
371-
target_include_directories(
372-
jit_lto_kernel_usage_requirements
373-
INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
374-
"${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
375-
)
376-
target_compile_options(
377-
jit_lto_kernel_usage_requirements INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
378-
"$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
379-
)
380-
target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
381-
target_link_libraries(
382-
jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL
383-
)
384-
385-
block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
386-
set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
387-
generate_jit_lto_kernels(
388-
interleaved_scan_files
389-
NAME_FORMAT
390-
"interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
391-
MATRIX_JSON_FILE
392-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
393-
KERNEL_INPUT_FILE
394-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in"
395-
EMBEDDED_INPUT_FILE
396-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in"
397-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
398-
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
399-
)
400-
generate_jit_lto_kernels(
401-
metric_files
402-
NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
403-
MATRIX_JSON_FILE
404-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
405-
KERNEL_INPUT_FILE
406-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in"
407-
EMBEDDED_INPUT_FILE
408-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in"
409-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
410-
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
411-
)
412-
generate_jit_lto_kernels(
413-
filter_files
414-
NAME_FORMAT "@filter_name@"
415-
MATRIX_JSON_FILE
416-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
417-
KERNEL_INPUT_FILE
418-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in"
419-
EMBEDDED_INPUT_FILE
420-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in"
421-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
422-
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
423-
)
424-
generate_jit_lto_kernels(
425-
post_lambda_files
426-
NAME_FORMAT "@post_lambda_name@"
427-
MATRIX_JSON_FILE
428-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
429-
KERNEL_INPUT_FILE
430-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in"
431-
EMBEDDED_INPUT_FILE
432-
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in"
433-
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
434-
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
435-
)
436-
endblock()
437-
438-
set(jit_lto_files
439-
${interleaved_scan_files}
440-
${metric_files}
441-
${filter_files}
442-
${post_lambda_files}
443-
src/detail/jit_lto/AlgorithmLauncher.cpp
444-
src/detail/jit_lto/AlgorithmPlanner.cpp
445-
src/detail/jit_lto/FragmentEntry.cpp
446-
src/detail/jit_lto/nvjitlink_checker.cpp
447-
)
448-
endif()
366+
add_library(jit_lto_kernel_usage_requirements INTERFACE)
367+
target_include_directories(
368+
jit_lto_kernel_usage_requirements
369+
INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src"
370+
"${CMAKE_CURRENT_SOURCE_DIR}/../c/include"
371+
)
372+
target_compile_options(
373+
jit_lto_kernel_usage_requirements INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>"
374+
"$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
375+
)
376+
target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20)
377+
target_link_libraries(jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL)
378+
379+
block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files)
380+
set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE})
381+
generate_jit_lto_kernels(
382+
interleaved_scan_files
383+
NAME_FORMAT
384+
"interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@"
385+
MATRIX_JSON_FILE
386+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json"
387+
KERNEL_INPUT_FILE
388+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in"
389+
EMBEDDED_INPUT_FILE
390+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in"
391+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan"
392+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
393+
)
394+
generate_jit_lto_kernels(
395+
metric_files
396+
NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@"
397+
MATRIX_JSON_FILE
398+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json"
399+
KERNEL_INPUT_FILE
400+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in"
401+
EMBEDDED_INPUT_FILE
402+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in"
403+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric"
404+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
405+
)
406+
generate_jit_lto_kernels(
407+
filter_files
408+
NAME_FORMAT "@filter_name@"
409+
MATRIX_JSON_FILE
410+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json"
411+
KERNEL_INPUT_FILE
412+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in"
413+
EMBEDDED_INPUT_FILE
414+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in"
415+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter"
416+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
417+
)
418+
generate_jit_lto_kernels(
419+
post_lambda_files
420+
NAME_FORMAT "@post_lambda_name@"
421+
MATRIX_JSON_FILE
422+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json"
423+
KERNEL_INPUT_FILE
424+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in"
425+
EMBEDDED_INPUT_FILE
426+
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in"
427+
OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda"
428+
KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements
429+
)
430+
endblock()
431+
432+
set(jit_lto_files
433+
${interleaved_scan_files}
434+
${metric_files}
435+
${filter_files}
436+
${post_lambda_files}
437+
src/detail/jit_lto/AlgorithmLauncher.cpp
438+
src/detail/jit_lto/AlgorithmPlanner.cpp
439+
src/detail/jit_lto/FragmentEntry.cpp
440+
src/detail/jit_lto/nvjitlink_checker.cpp
441+
)
449442

450443
add_library(
451444
cuvs_objs OBJECT
@@ -674,10 +667,8 @@ if(NOT BUILD_CPU_ONLY)
674667
)
675668

676669
target_compile_definitions(
677-
cuvs_objs
678-
PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
679-
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
680-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
670+
cuvs_objs PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
671+
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
681672
)
682673

683674
target_link_libraries(
@@ -752,10 +743,8 @@ if(NOT BUILD_CPU_ONLY)
752743
"$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:Debug>>:${CUVS_DEBUG_CUDA_FLAGS}>"
753744
)
754745
target_compile_definitions(
755-
cuvs
756-
PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
757-
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
758-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
746+
cuvs PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
747+
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
759748
)
760749

761750
target_link_libraries(
@@ -767,11 +756,8 @@ if(NOT BUILD_CPU_ONLY)
767756
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:NCCL::NCCL>>
768757
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>>
769758
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
770-
PRIVATE rmm::rmm
771-
$<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
772-
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
773-
$<COMPILE_ONLY:cuco::cuco>
774-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
759+
PRIVATE rmm::rmm $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
760+
$<COMPILE_ONLY:nvidia::cutlass::cutlass> $<COMPILE_ONLY:cuco::cuco> CUDA::nvJitLink
775761
)
776762
set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON)
777763

@@ -810,10 +796,8 @@ SECTIONS
810796

811797
target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
812798
target_compile_definitions(
813-
cuvs_static
814-
PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
815-
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
816-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUVS_ENABLE_JIT_LTO>
799+
cuvs_static PUBLIC $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
800+
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
817801
)
818802

819803
target_include_directories(cuvs_static INTERFACE "$<INSTALL_INTERFACE:include>")
@@ -831,7 +815,7 @@ SECTIONS
831815
$<BUILD_LOCAL_INTERFACE:$<TARGET_NAME_IF_EXISTS:hnswlib::hnswlib>> # header only
832816
PRIVATE rmm::rmm
833817
$<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
834-
$<$<BOOL:${JIT_LTO_COMPILATION}>:CUDA::nvJitLink>
818+
CUDA::nvJitLink
835819
$<$<BOOL:${CUVS_NVTX}>:CUDA::nvtx3>
836820
$<COMPILE_ONLY:nvidia::cutlass::cutlass>
837821
$<COMPILE_ONLY:cuco::cuco>

0 commit comments

Comments
 (0)