From cff1aeda3dac72396fac92a2bf574741ea968a9b Mon Sep 17 00:00:00 2001 From: "Frank Lin (Engrg-Hardware 1)" Date: Fri, 16 Feb 2024 05:16:01 +0000 Subject: [PATCH 1/2] fix test_graph_reindex --- cmake/external/cccl.cmake | 6 ++ .../phi/kernels/gpu/graph_reindex_kernel.cu | 59 +++++++------------ patches/cccl/util_device.cuh.patch | 22 +++++++ 3 files changed, 48 insertions(+), 39 deletions(-) create mode 100644 patches/cccl/util_device.cuh.patch diff --git a/cmake/external/cccl.cmake b/cmake/external/cccl.cmake index db09c01f92e742..18b9d010adde3a 100755 --- a/cmake/external/cccl.cmake +++ b/cmake/external/cccl.cmake @@ -15,12 +15,18 @@ set(CCCL_INCLUDE_DIR ${CCCL_SOURCE_DIR}) message("CCCL_INCLUDE_DIR is ${CCCL_INCLUDE_DIR}") include_directories(${CCCL_INCLUDE_DIR}) +file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/cccl/util_device.cuh.patch + native_src) +set(CCCL_PATCH_COMMAND git checkout -- . && git checkout ${CCCL_TAG} && patch + -p1 -Nd ${CCCL_SOURCE_DIR} < ${native_src}) + ExternalProject_Add( extern_cccl ${EXTERNAL_PROJECT_LOG_ARGS} SOURCE_DIR ${CCCL_SOURCE_DIR} PREFIX ${CCCL_PREFIX_DIR} UPDATE_COMMAND "" + PATCH_COMMAND ${CCCL_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index c0454619b657ca..c1f635bfdf8aab 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -67,53 +67,34 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, input, num_input, len_hashtable, keys, key_index); // Get item index count. - auto item_count = - phi::memory_utils::Alloc(place, (num_input + 1) * sizeof(int)); - int* item_count_ptr = reinterpret_cast(item_count->ptr()); -#ifdef PADDLE_WITH_HIP - hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); -#else - cudaMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); -#endif + thrust::device_vector item_count(num_input + 1, 0); GetItemIndexCount<<>>( - input, item_count_ptr, num_input, len_hashtable, keys, key_index); - - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, item_count_ptr, item_count_ptr, num_input + 1); - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); - cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), - temp_storage_bytes, - item_count_ptr, - item_count_ptr, - num_input + 1); - int total_unique_items = 0; -#ifdef PADDLE_WITH_HIP - hipMemcpy(&total_unique_items, - item_count_ptr + num_input, - sizeof(int), - hipMemcpyDeviceToHost); -#else - cudaMemcpy(&total_unique_items, - item_count_ptr + num_input, - sizeof(int), - cudaMemcpyDeviceToHost); -#endif + input, + thrust::raw_pointer_cast(item_count.data()), + num_input, + len_hashtable, + keys, + key_index); + thrust::exclusive_scan( + item_count.begin(), item_count.end(), item_count.begin()); + + int total_unique_items = item_count[num_input]; auto unique_items = phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T)); T* unique_items_data = reinterpret_cast(unique_items->ptr()); *final_nodes_len = total_unique_items; // Get unique items - FillUniqueItems<<>>(input, - num_input, - len_hashtable, - unique_items_data, - item_count_ptr, - keys, - values, - key_index); + FillUniqueItems<<>>( + input, + num_input, + len_hashtable, + unique_items_data, + thrust::raw_pointer_cast(item_count.data()), + keys, + values, + key_index); return unique_items; } diff --git a/patches/cccl/util_device.cuh.patch b/patches/cccl/util_device.cuh.patch new file mode 100644 index 00000000000000..0f0a4b0e4def37 --- /dev/null +++ b/patches/cccl/util_device.cuh.patch @@ -0,0 +1,22 @@ +diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh +index c7e15cafe..988cb9746 100644 +--- a/cub/cub/util_device.cuh ++++ b/cub/cub/util_device.cuh +@@ -278,7 +278,7 @@ public: + /** + * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). + */ +-CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) ++CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version) + { + // Instantiate `EmptyKernel` in both host and device code to ensure + // it can be called. +@@ -593,7 +593,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) + * + */ + template +-CUB_RUNTIME_FUNCTION inline ++CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t MaxSmOccupancy( + int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy From 18a24af9983848c492d02014db2f62b47d2615df Mon Sep 17 00:00:00 2001 From: "Tian Zheng (Engrg-Hardware 1)" Date: Mon, 26 Feb 2024 23:17:37 -0800 Subject: [PATCH 2/2] Fix test_weight_decay --- patches/cccl/util_device.cuh.patch | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/patches/cccl/util_device.cuh.patch b/patches/cccl/util_device.cuh.patch index 0f0a4b0e4def37..bdf7165328d503 100644 --- a/patches/cccl/util_device.cuh.patch +++ b/patches/cccl/util_device.cuh.patch @@ -1,5 +1,5 @@ diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh -index c7e15cafe..988cb9746 100644 +index c7e15cafe..756336914 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -278,7 +278,7 @@ public: @@ -11,6 +11,15 @@ index c7e15cafe..988cb9746 100644 { // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. +@@ -375,7 +375,7 @@ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device) + * + * \note This function is thread safe. + */ +-CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version) ++CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) + { + cudaError_t result = cudaErrorUnknown; + NV_IF_TARGET( @@ -593,7 +593,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) * */