diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c2aba4ce9..d1c141b750 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,6 @@ set(FAISS_LANGUAGES CXX) if(FAISS_ENABLE_GPU) if (FAISS_ENABLE_ROCM) - set(USE_ROCM TRUE) list(APPEND FAISS_LANGUAGES HIP) list(PREPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake") list(PREPEND CMAKE_PREFIX_PATH "/opt/rocm") @@ -66,9 +65,9 @@ option(FAISS_ENABLE_PYTHON "Build Python extension." ON) option(FAISS_ENABLE_C_API "Build C API." OFF) if(FAISS_ENABLE_GPU) - if(USE_ROCM) + if(FAISS_ENABLE_ROCM) enable_language(HIP) - add_definitions(-DUSE_ROCM) + add_definitions(-DUSE_AMD_ROCM) find_package(HIP REQUIRED) find_package(hipBLAS REQUIRED) set(GPU_EXT_PREFIX "hip") @@ -83,15 +82,10 @@ if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft) find_package(raft COMPONENTS compiled distributed) endif() -if(USE_ROCM) - find_package(HIP REQUIRED) - find_package(hipBLAS REQUIRED) -endif() - add_subdirectory(faiss) if(FAISS_ENABLE_GPU) - if(USE_ROCM) + if(FAISS_ENABLE_ROCM) add_subdirectory(faiss/gpu-rocm) else() add_subdirectory(faiss/gpu) @@ -116,7 +110,7 @@ if(BUILD_TESTING) add_subdirectory(tests) if(FAISS_ENABLE_GPU) - if(USE_ROCM) + if(FAISS_ENABLE_ROCM) add_subdirectory(faiss/gpu-rocm/test) else() add_subdirectory(faiss/gpu/test) diff --git a/c_api/CMakeLists.txt b/c_api/CMakeLists.txt index 06d85c6aef..9e01aabff7 100644 --- a/c_api/CMakeLists.txt +++ b/c_api/CMakeLists.txt @@ -56,7 +56,7 @@ add_executable(example_c EXCLUDE_FROM_ALL example_c.c) target_link_libraries(example_c PRIVATE faiss_c) if(FAISS_ENABLE_GPU) - if(USE_ROCM) + if(FAISS_ENABLE_ROCM) add_subdirectory(gpu-rocm) else () add_subdirectory(gpu) diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt index 32b374ece9..2fa1209c40 100644 --- a/c_api/gpu/CMakeLists.txt +++ b/c_api/gpu/CMakeLists.txt @@ -15,13 +15,13 @@ target_sources(faiss_c PRIVATE file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h") faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu) -if (USE_ROCM) -find_package(HIP REQUIRED) -find_package(hipBLAS REQUIRED) -target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas) +if (FAISS_ENABLE_ROCM) + target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas) else() -find_package(CUDAToolkit REQUIRED) -target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) + find_package(CUDAToolkit REQUIRED) + target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) endif() add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index c97bae7032..b843622661 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -197,7 +197,7 @@ function(generate_ivf_interleaved_code) "64|2048|8" ) - if (USE_ROCM) + if (FAISS_ENABLE_ROCM) list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip) endif() @@ -294,7 +294,7 @@ if(FAISS_ENABLE_RAFT) target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1) endif() -if (USE_ROCM) +if (FAISS_ENABLE_ROCM) list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip) endif() @@ -313,8 +313,8 @@ foreach(header ${FAISS_GPU_HEADERS}) ) endforeach() -if (USE_ROCM) - target_link_libraries(faiss_gpu PRIVATE $<$:hip::host> $<$:roc::hipblas>) +if (FAISS_ENABLE_ROCM) + target_link_libraries(faiss_gpu PRIVATE hip::host roc::hipblas) target_compile_options(faiss_gpu PRIVATE) else() # Prepares a host linker script and enables host linker to support @@ -333,6 +333,13 @@ else() target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") find_package(CUDAToolkit REQUIRED) - target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:raft::compiled> $<$:nvidia::cutlass::cutlass> $<$:OpenMP::OpenMP_CXX>) - target_compile_options(faiss_gpu PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) + target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas + $<$:raft::raft> + $<$:raft::compiled> + $<$:nvidia::cutlass::cutlass> + $<$:OpenMP::OpenMP_CXX>) + target_compile_options(faiss_gpu PRIVATE + $<$:-Xfatbin=-compress-all + --expt-extended-lambda --expt-relaxed-constexpr + $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) endif() diff --git a/faiss/gpu/GpuFaissAssert.h b/faiss/gpu/GpuFaissAssert.h index 7d36fbd8b5..4986a0bca5 100644 --- a/faiss/gpu/GpuFaissAssert.h +++ b/faiss/gpu/GpuFaissAssert.h @@ -15,7 +15,7 @@ /// Assertions /// -#if defined(__CUDA_ARCH__) || defined(USE_ROCM) +#if defined(__CUDA_ARCH__) || defined(USE_AMD_ROCM) #define GPU_FAISS_ASSERT(X) assert(X) #define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X) #define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X) diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index ae3c8793e7..059a6049de 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -363,7 +363,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { prop.major, prop.minor); -#if USE_ROCM +#if USE_AMD_ROCM // Our code is pre-built with and expects warpSize == 32 or 64, validate // that FAISS_ASSERT_FMT( diff --git a/faiss/gpu/hipify.sh b/faiss/gpu/hipify.sh index 09d466545e..dc0af11a7e 100755 --- a/faiss/gpu/hipify.sh +++ b/faiss/gpu/hipify.sh @@ -3,35 +3,35 @@ # go one level up from faiss/gpu top=$(dirname "${BASH_SOURCE[0]}")/.. echo "top=$top" -cd $top -echo "pwd=`pwd`" +cd "$top" || exit +echo "pwd=$(pwd)" # create all destination directories for hipified files into sibling 'gpu-rocm' directory -for src in $(find ./gpu -type d) +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's/gpu/gpu-rocm/') + dst="${src//gpu/gpu-rocm}" echo "Creating $dst" - mkdir -p $dst -done + mkdir -p "$dst" +done < <(find ./gpu -type d -print0) # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming # run all files in parallel to speed up for ext in cu cuh h cpp do - for src in $(find ./gpu -name "*.$ext") + while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - hipify-perl -o=$dst.tmp $src & - done + dst="${src//\.\/gpu/\.\/gpu-rocm}" + hipify-perl -o="$dst.tmp" "$src" & + done < <(find ./gpu -name "*.$ext" -print0) done wait # rename all hipified *.cu files to *.hip -for src in $(find ./gpu-rocm -name "*.cu.tmp") +while IFS= read -r -d '' src do dst=${src%.cu.tmp}.hip.tmp - mv $src $dst -done + mv "$src" "$dst" +done < <(find ./gpu-rocm -name "*.cu.tmp" -print0) # replace header include statements "@#include @' $src - sed -i 's@#include @#include @' $src - done + sed -i 's@#include @#include @' "$src" + sed -i 's@#include @#include @' "$src" + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # hipify was run in parallel above # don't copy the tmp file if it is unchanged for ext in hip cuh h cpp do - for src in $(find ./gpu-rocm -name "*.$ext.tmp") + while IFS= read -r -d '' src do dst=${src%.tmp} - if test -f $dst + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" - rm $src + rm "$src" else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi - done + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # copy over CMakeLists.txt -for src in $(find ./gpu -name "CMakeLists.txt") +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - if test -f $dst + dst="${src//\.\/gpu/\.\/gpu-rocm}" + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi -done +done < <(find ./gpu -name "CMakeLists.txt" -print0) # Copy over other files -for ext in py +other_exts="py" +for ext in $other_exts do - for src in $(find ./gpu -name "*.$ext") + while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - if test -f $dst + dst="${src//\.\/gpu/\.\/gpu-rocm}" + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi - done + done < <(find ./gpu -name "*.$ext" -print0) done - ################################################################################### # C_API Support ################################################################################### @@ -122,36 +122,36 @@ done # This points to the faiss/c_api dir top_c_api=$(dirname "${BASH_SOURCE[0]}")/../../c_api echo "top=$top_c_api" -cd ../$top_c_api -echo "pwd=`pwd`" +cd "../$top_c_api" || exit +echo "pwd=$(pwd)" # create all destination directories for hipified files into sibling 'gpu-rocm' directory -for src in $(find ./gpu -type d) +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's/gpu/gpu-rocm/') + dst="${src//gpu/gpu-rocm}" echo "Creating $dst" - mkdir -p $dst -done + mkdir -p "$dst" +done < <(find ./gpu -type d -print0) # run hipify-perl against all *.cu *.cuh *.h *.cpp files, no renaming # run all files in parallel to speed up for ext in cu cuh h cpp c do - for src in $(find ./gpu -name "*.$ext") + while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - hipify-perl -o=$dst.tmp $src & - done + dst="${src//\.\/gpu/\.\/gpu-rocm}" + hipify-perl -o="$dst.tmp" "$src" & + done < <(find ./gpu -name "*.$ext" -print0) done wait # rename all hipified *.cu files to *.hip -for src in $(find ./gpu-rocm -name "*.cu.tmp") +while IFS= read -r -d '' src do dst=${src%.cu.tmp}.hip.tmp - mv $src $dst -done + mv "$src" "$dst" +done < <(find ./gpu-rocm -name "*.cu.tmp" -print0) # replace header include statements "@#include @' $src - sed -i 's@#include @#include @' $src - done + sed -i 's@#include @#include @' "$src" + sed -i 's@#include @#include @' "$src" + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # hipify was run in parallel above # don't copy the tmp file if it is unchanged for ext in hip cuh h cpp c do - for src in $(find ./gpu-rocm -name "*.$ext.tmp") + while IFS= read -r -d '' src do dst=${src%.tmp} - if test -f $dst + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" - rm $src + rm "$src" else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi else echo "$dst" - mv $src $dst + mv "$src" "$dst" fi - done + done < <(find ./gpu-rocm -name "*.$ext.tmp" -print0) done # copy over CMakeLists.txt -for src in $(find ./gpu -name "CMakeLists.txt") +while IFS= read -r -d '' src do - dst=$(echo $src | sed 's@./gpu@./gpu-rocm@') - if test -f $dst + dst="${src//\.\/gpu/\.\/gpu-rocm}" + if test -f "$dst" then - if diff -q $src $dst >& /dev/null + if diff -q "$src" "$dst" >& /dev/null then echo "$dst [unchanged]" else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi else echo "$dst" - cp $src $dst + cp "$src" "$dst" fi -done +done < <(find ./gpu -name "CMakeLists.txt" -print0) diff --git a/faiss/gpu/impl/IVFAppend.cu b/faiss/gpu/impl/IVFAppend.cu index 8ee85eaed8..65af470cd3 100644 --- a/faiss/gpu/impl/IVFAppend.cu +++ b/faiss/gpu/impl/IVFAppend.cu @@ -411,7 +411,8 @@ __global__ void ivfInterleavedAppend( EncodeT* listStart = ((EncodeT*)listData[listId]); // Each warp within the block handles a different chunk of kWarpSize - auto warpVec = alignedListVecStart + warpId * kWarpSize; + auto warpVec = alignedListVecStart + + (faiss::gpu::Tensor::DataType)warpId * kWarpSize; // The warp data starts here EncodeT* warpData = listStart + (warpVec / kWarpSize) * wordsPerVectorBlock; diff --git a/faiss/gpu/impl/InterleavedCodes.cpp b/faiss/gpu/impl/InterleavedCodes.cpp index bd9464d5c8..5a2dc4b770 100644 --- a/faiss/gpu/impl/InterleavedCodes.cpp +++ b/faiss/gpu/impl/InterleavedCodes.cpp @@ -168,7 +168,7 @@ void unpackInterleavedWord( int dims, int bitsPerCode) { int warpSize = getWarpSizeCurrentDevice(); - int wordsPerDimBlock = warpSize * bitsPerCode / (8 * sizeof(T)); + int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T)); int wordsPerBlock = wordsPerDimBlock * dims; int numBlocks = utils::divUp(numVecs, warpSize); @@ -446,7 +446,7 @@ void packInterleavedWord( int dims, int bitsPerCode) { int warpSize = getWarpSizeCurrentDevice(); - int wordsPerDimBlock = warpSize * bitsPerCode / (8 * sizeof(T)); + int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T)); int wordsPerBlock = wordsPerDimBlock * dims; int numBlocks = utils::divUp(numVecs, warpSize); diff --git a/faiss/gpu/impl/PQCodeDistances-inl.cuh b/faiss/gpu/impl/PQCodeDistances-inl.cuh index 4306426e2d..f054915b7e 100644 --- a/faiss/gpu/impl/PQCodeDistances-inl.cuh +++ b/faiss/gpu/impl/PQCodeDistances-inl.cuh @@ -20,7 +20,7 @@ namespace faiss { namespace gpu { -#if defined(USE_ROCM) && __AMDGCN_WAVEFRONT_SIZE == 64u +#if defined(USE_AMD_ROCM) && __AMDGCN_WAVEFRONT_SIZE == 64u #define LAUNCH_BOUND 320 #else #define LAUNCH_BOUND 288 diff --git a/faiss/gpu/impl/PQCodeLoad.cuh b/faiss/gpu/impl/PQCodeLoad.cuh index a37e908a1d..fcca5c3ad1 100644 --- a/faiss/gpu/impl/PQCodeLoad.cuh +++ b/faiss/gpu/impl/PQCodeLoad.cuh @@ -47,7 +47,7 @@ inline __device__ unsigned int getByte(uint64_t v, int pos, int width) { return getBitfield(v, pos, width); } -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM template struct LoadCode32 {}; @@ -276,7 +276,7 @@ struct LoadCode32<96> { } }; -#else // USE_ROCM +#else // USE_AMD_ROCM template struct LoadCode32 {}; @@ -609,7 +609,7 @@ struct LoadCode32<96> { } }; -#endif // USE_ROCM +#endif // USE_AMD_ROCM } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/VectorResidual.cu b/faiss/gpu/impl/VectorResidual.cu index ed24a69e1f..75ffb9ef48 100644 --- a/faiss/gpu/impl/VectorResidual.cu +++ b/faiss/gpu/impl/VectorResidual.cu @@ -8,7 +8,7 @@ #include #include #include -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM #define CUDART_NAN_F __int_as_float(0x7fffffff) #else #include // in CUDA SDK, for CUDART_NAN_F diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 2983ddc219..073403e13a 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -20,11 +20,14 @@ # Defines `gtest_discover_tests()`. include(GoogleTest) add_library(faiss_gpu_test_helper TestUtils.cpp) -if(USE_ROCM) - target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest $<$:hip::host>) +if(FAISS_ENABLE_ROCM) + target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest hip::host) else() find_package(CUDAToolkit REQUIRED) - target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$:raft::raft> $<$:raft::compiled>) + target_link_libraries(faiss_gpu_test_helper PUBLIC + faiss gtest CUDA::cudart + $<$:raft::raft> + $<$:raft::compiled>) endif() macro(faiss_gpu_test file) @@ -52,9 +55,9 @@ endif() add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) -if (USE_ROCM) +if (FAISS_ENABLE_ROCM) target_link_libraries(demo_ivfpq_indexing_gpu - PRIVATE faiss gtest_main $<$:hip::host>) + PRIVATE faiss gtest_main hip::host) else() target_link_libraries(demo_ivfpq_indexing_gpu PRIVATE faiss gtest_main CUDA::cudart) diff --git a/faiss/gpu/utils/DeviceDefs.cuh b/faiss/gpu/utils/DeviceDefs.cuh index 521f104265..c89f1cb07e 100644 --- a/faiss/gpu/utils/DeviceDefs.cuh +++ b/faiss/gpu/utils/DeviceDefs.cuh @@ -12,7 +12,7 @@ namespace faiss { namespace gpu { -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM #if __AMDGCN_WAVEFRONT_SIZE == 32u constexpr int kWarpSize = 32; @@ -27,7 +27,7 @@ __forceinline__ __device__ void warpFence() { #define GPU_MAX_SELECTION_K 2048 -#else // USE_ROCM +#else // USE_AMD_ROCM // We require at least CUDA 8.0 for compilation #if CUDA_VERSION < 8000 @@ -56,7 +56,7 @@ __forceinline__ __device__ void warpFence() { #define GPU_MAX_SELECTION_K 1024 #endif -#endif // USE_ROCM +#endif // USE_AMD_ROCM } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/utils/DeviceUtils.cu b/faiss/gpu/utils/DeviceUtils.cu index 1664e218da..7a22c09e8f 100644 --- a/faiss/gpu/utils/DeviceUtils.cu +++ b/faiss/gpu/utils/DeviceUtils.cu @@ -124,7 +124,7 @@ int getDeviceForAddress(const void* p) { return -1; } -#if USE_ROCM +#if USE_AMD_ROCM if (att.type != hipMemoryTypeHost && att.type != hipMemoryTypeUnregistered) { return att.device; diff --git a/faiss/gpu/utils/Float16.cuh b/faiss/gpu/utils/Float16.cuh index 3a676538c3..8ff15df153 100644 --- a/faiss/gpu/utils/Float16.cuh +++ b/faiss/gpu/utils/Float16.cuh @@ -12,7 +12,7 @@ #include // Some compute capabilities have full float16 ALUs. -#if __CUDA_ARCH__ >= 530 || defined(USE_ROCM) +#if __CUDA_ARCH__ >= 530 || defined(USE_AMD_ROCM) #define FAISS_USE_FULL_FLOAT16 1 #endif // __CUDA_ARCH__ types diff --git a/faiss/gpu/utils/Limits.cuh b/faiss/gpu/utils/Limits.cuh index 7a65fbf1b6..9815e2d772 100644 --- a/faiss/gpu/utils/Limits.cuh +++ b/faiss/gpu/utils/Limits.cuh @@ -33,7 +33,7 @@ struct Limits { }; inline __device__ __host__ half kGetHalf(unsigned short v) { -#if CUDA_VERSION >= 9000 || defined(USE_ROCM) +#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM) __half_raw h; h.x = v; return __half(h); diff --git a/faiss/gpu/utils/LoadStoreOperators.cuh b/faiss/gpu/utils/LoadStoreOperators.cuh index e00c5d85df..f86793ddfc 100644 --- a/faiss/gpu/utils/LoadStoreOperators.cuh +++ b/faiss/gpu/utils/LoadStoreOperators.cuh @@ -23,7 +23,7 @@ namespace faiss { namespace gpu { -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM template struct LoadStore { @@ -66,7 +66,7 @@ struct LoadStore { } }; -#else // USE_ROCM +#else // USE_AMD_ROCM template struct LoadStore { @@ -142,7 +142,7 @@ struct LoadStore { } }; -#endif // USE_ROCM +#endif // USE_AMD_ROCM } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/utils/MathOperators.cuh b/faiss/gpu/utils/MathOperators.cuh index 87f779feba..7fe8098a9f 100644 --- a/faiss/gpu/utils/MathOperators.cuh +++ b/faiss/gpu/utils/MathOperators.cuh @@ -282,7 +282,7 @@ struct Math { } static inline __device__ half zero() { -#if CUDA_VERSION >= 9000 || defined(USE_ROCM) +#if CUDA_VERSION >= 9000 || defined(USE_AMD_ROCM) return 0; #else half h; diff --git a/faiss/gpu/utils/MatrixMult-inl.cuh b/faiss/gpu/utils/MatrixMult-inl.cuh index ce9922e071..841df93c1a 100644 --- a/faiss/gpu/utils/MatrixMult-inl.cuh +++ b/faiss/gpu/utils/MatrixMult-inl.cuh @@ -20,7 +20,7 @@ namespace gpu { template struct GetCudaType; -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM template <> struct GetCudaType { static constexpr hipblasDatatype_t Type = HIPBLAS_R_32F; @@ -61,7 +61,7 @@ cublasStatus_t rawGemm( auto cAT = GetCudaType::Type; auto cBT = GetCudaType::Type; -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM return hipblasGemmEx( handle, transa, @@ -135,7 +135,7 @@ cublasStatus_t rawGemm( C, CUDA_R_32F, ldc); -#endif // USE_ROCM +#endif // USE_AMD_ROCM } template @@ -162,7 +162,7 @@ cublasStatus_t rawBatchGemm( auto cBT = GetCudaType::Type; // Always accumulate in f32 -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM return hipblasGemmStridedBatchedEx( handle, transa, diff --git a/faiss/gpu/utils/PtxUtils.cuh b/faiss/gpu/utils/PtxUtils.cuh index a6617aa0b6..d22835464c 100644 --- a/faiss/gpu/utils/PtxUtils.cuh +++ b/faiss/gpu/utils/PtxUtils.cuh @@ -8,14 +8,14 @@ #pragma once #include -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM #include #endif namespace faiss { namespace gpu { -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM #define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \ do { \ @@ -51,7 +51,7 @@ __device__ __forceinline__ int getLaneId() { return ::__lane_id(); } -#else // USE_ROCM +#else // USE_AMD_ROCM // defines to simplify the SASS assembly structure file/line in the profiler #define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \ @@ -129,7 +129,7 @@ __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) { : "memory"); } -#endif // USE_ROCM +#endif // USE_AMD_ROCM } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/utils/Select.cuh b/faiss/gpu/utils/Select.cuh index f4d05a6cc8..d2b316b6f8 100644 --- a/faiss/gpu/utils/Select.cuh +++ b/faiss/gpu/utils/Select.cuh @@ -207,7 +207,7 @@ struct BlockSelect { __device__ inline void checkThreadQ() { bool needSort = (numVals == NumThreadQ); -#if CUDA_VERSION < 9000 || defined(USE_ROCM) +#if CUDA_VERSION < 9000 || defined(USE_AMD_ROCM) needSort = __any(needSort); #else needSort = __any_sync(0xffffffff, needSort); @@ -484,7 +484,7 @@ struct WarpSelect { __device__ inline void checkThreadQ() { bool needSort = (numVals == NumThreadQ); -#if CUDA_VERSION < 9000 || defined(USE_ROCM) +#if CUDA_VERSION < 9000 || defined(USE_AMD_ROCM) needSort = __any(needSort); #else needSort = __any_sync(0xffffffff, needSort); diff --git a/faiss/gpu/utils/Tensor.cuh b/faiss/gpu/utils/Tensor.cuh index 5cfb19c02c..e16425848d 100644 --- a/faiss/gpu/utils/Tensor.cuh +++ b/faiss/gpu/utils/Tensor.cuh @@ -469,7 +469,7 @@ class SubTensor { /// Use the texture cache for reads __device__ inline typename TensorType::DataType ldg() const { -#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +#if __CUDA_ARCH__ >= 350 || defined(USE_AMD_ROCM) return __ldg(data_); #else return *data_; @@ -479,7 +479,7 @@ class SubTensor { /// Use the texture cache for reads; cast as a particular type template __device__ inline T ldgAs() const { -#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +#if __CUDA_ARCH__ >= 350 || defined(USE_AMD_ROCM) return __ldg(dataAs()); #else return as(); @@ -605,7 +605,7 @@ class SubTensor { /// Use the texture cache for reads __device__ inline typename TensorType::DataType ldg() const { -#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +#if __CUDA_ARCH__ >= 350 || defined(USE_AMD_ROCM) return __ldg(data_); #else return *data_; @@ -615,7 +615,7 @@ class SubTensor { /// Use the texture cache for reads; cast as a particular type template __device__ inline T ldgAs() const { -#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +#if __CUDA_ARCH__ >= 350 || defined(USE_AMD_ROCM) return __ldg(dataAs()); #else return as(); diff --git a/faiss/gpu/utils/Transpose.cuh b/faiss/gpu/utils/Transpose.cuh index f5aaa8551e..b6e10efbda 100644 --- a/faiss/gpu/utils/Transpose.cuh +++ b/faiss/gpu/utils/Transpose.cuh @@ -84,7 +84,7 @@ __global__ void transposeAny( auto inputOffset = TensorInfoOffset::get(input, i); auto outputOffset = TensorInfoOffset::get(output, i); -#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +#if __CUDA_ARCH__ >= 350 || defined(USE_AMD_ROCM) output.data[outputOffset] = __ldg(&input.data[inputOffset]); #else output.data[outputOffset] = input.data[inputOffset]; diff --git a/faiss/gpu/utils/WarpShuffles.cuh b/faiss/gpu/utils/WarpShuffles.cuh index 5af6d71ae7..b8dd66f1a8 100644 --- a/faiss/gpu/utils/WarpShuffles.cuh +++ b/faiss/gpu/utils/WarpShuffles.cuh @@ -102,7 +102,7 @@ inline __device__ T* shfl_xor( return (T*)shfl_xor(v, laneMask, width); } -#ifdef USE_ROCM +#ifdef USE_AMD_ROCM inline __device__ half shfl(half v, int srcLane, int width = kWarpSize) { unsigned int vu = __half2uint_rn(v); @@ -139,7 +139,7 @@ inline __device__ half shfl_xor(half v, int laneMask, int width = kWarpSize) { } #endif // CUDA_VERSION -#endif // USE_ROCM +#endif // USE_AMD_ROCM } // namespace gpu } // namespace faiss diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index 84bc331421..5a01a759e7 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -38,10 +38,9 @@ macro(configure_swigfaiss source) set_source_files_properties(${source} PROPERTIES COMPILE_DEFINITIONS GPU_WRAPPER ) - if (USE_ROCM) - message(USE_ROCM="${USE_ROCM}") + if (FAISS_ENABLE_ROCM) set_source_files_properties(${source} PROPERTIES - COMPILE_DEFINITIONS USE_ROCM + COMPILE_DEFINITIONS FAISS_ENABLE_ROCM ) endif() if (FAISS_ENABLE_RAFT) @@ -67,23 +66,34 @@ if(TARGET faiss) # Manually add headers as extra dependencies of swigfaiss. set(SWIG_MODULE_swigfaiss_EXTRA_DEPS) foreach(h ${FAISS_HEADERS}) - list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") - list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") endforeach() - if(USE_ROCM) + if(FAISS_ENABLE_ROCM) foreach(h ${FAISS_GPU_HEADERS}) - list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") endforeach() else() foreach(h ${FAISS_GPU_HEADERS}) - list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") - list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") - list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx2_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu/${h}") endforeach() endif() else() @@ -157,20 +167,27 @@ else() endif() if(FAISS_ENABLE_GPU) - if(USE_ROCM) - find_package(HIP REQUIRED) - target_link_libraries(swigfaiss PRIVATE $<$:hip::host>) - target_link_libraries(swigfaiss_avx2 PRIVATE $<$:hip::host>) - target_link_libraries(swigfaiss_avx512 PRIVATE $<$:hip::host>) + if(FAISS_ENABLE_ROCM) + target_link_libraries(swigfaiss PRIVATE hip::host) + target_link_libraries(swigfaiss_avx2 PRIVATE hip::host) + target_link_libraries(swigfaiss_avx512 PRIVATE hip::host) else() find_package(CUDAToolkit REQUIRED) if(FAISS_ENABLE_RAFT) find_package(raft COMPONENTS compiled distributed) endif() - target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) - target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) - target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) - target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$:raft::raft> $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) + target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart + $<$:raft::raft> + $<$:nvidia::cutlass::cutlass>) endif() endif() diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index b507843f3c..43c8d6c422 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -295,7 +295,7 @@ void gpu_profiler_stop(); void gpu_sync_all_devices(); #ifdef GPU_WRAPPER -#ifdef USE_ROCM +#ifdef FAISS_ENABLE_ROCM %shared_ptr(faiss::gpu::GpuResources); %shared_ptr(faiss::gpu::StandardGpuResourcesImpl); @@ -365,7 +365,7 @@ int64_t cast_cudastream_t_to_integer(hipStream_t x) { %} -#else // USE_ROCM +#else // FAISS_ENABLE_ROCM %shared_ptr(faiss::gpu::GpuResources); %shared_ptr(faiss::gpu::StandardGpuResourcesImpl); @@ -438,7 +438,7 @@ int64_t cast_cudastream_t_to_integer(cudaStream_t x) { %} -#endif // USE_ROCM +#endif // FAISS_ENABLE_ROCM #else // GPU_WRAPPER %{ @@ -631,7 +631,7 @@ struct faiss::simd16uint16 {}; #ifdef GPU_WRAPPER -#ifdef USE_ROCM +#ifdef FAISS_ENABLE_ROCM // quiet SWIG warnings %ignore faiss::gpu::GpuIndexIVF::GpuIndexIVF; @@ -648,7 +648,7 @@ struct faiss::simd16uint16 {}; %include %include -#else // USE_ROCM +#else // FAISS_ENABLE_ROCM // quiet SWIG warnings %ignore faiss::gpu::GpuIndexIVF::GpuIndexIVF; @@ -668,7 +668,7 @@ struct faiss::simd16uint16 {}; %include %include -#endif // USE_ROCM +#endif // FAISS_ENABLE_ROCM #endif @@ -904,7 +904,7 @@ faiss::Quantizer * downcast_Quantizer (faiss::Quantizer *aq) #ifdef GPU_WRAPPER -#ifdef USE_ROCM +#ifdef FAISS_ENABLE_ROCM %include %newobject index_gpu_to_cpu; @@ -913,7 +913,7 @@ faiss::Quantizer * downcast_Quantizer (faiss::Quantizer *aq) %include -#else // USE_ROCM +#else // FAISS_ENABLE_ROCM %include %newobject index_gpu_to_cpu; @@ -922,7 +922,7 @@ faiss::Quantizer * downcast_Quantizer (faiss::Quantizer *aq) %include -#endif // USE_ROCM +#endif // FAISS_ENABLE_ROCM #endif