From cff1aeda3dac72396fac92a2bf574741ea968a9b Mon Sep 17 00:00:00 2001
From: "Frank Lin (Engrg-Hardware 1)" <fralin@nvidia.com>
Date: Fri, 16 Feb 2024 05:16:01 +0000
Subject: [PATCH 1/2] fix test_graph_reindex

---
 cmake/external/cccl.cmake                     |  6 ++
 .../phi/kernels/gpu/graph_reindex_kernel.cu   | 59 +++++++------------
 patches/cccl/util_device.cuh.patch            | 22 +++++++
 3 files changed, 48 insertions(+), 39 deletions(-)
 create mode 100644 patches/cccl/util_device.cuh.patch
diff --git a/cmake/external/cccl.cmake b/cmake/external/cccl.cmake
index db09c01f92e742..18b9d010adde3a 100755
--- a/cmake/external/cccl.cmake
+++ b/cmake/external/cccl.cmake
@@ -15,12 +15,18 @@ set(CCCL_INCLUDE_DIR ${CCCL_SOURCE_DIR})
 message("CCCL_INCLUDE_DIR is ${CCCL_INCLUDE_DIR}")
 include_directories(${CCCL_INCLUDE_DIR})
 
+file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/cccl/util_device.cuh.patch
+     native_src)
+set(CCCL_PATCH_COMMAND git checkout -- . && git checkout ${CCCL_TAG} && patch
+                       -p1 -Nd ${CCCL_SOURCE_DIR} < ${native_src})
+
 ExternalProject_Add(
   extern_cccl
   ${EXTERNAL_PROJECT_LOG_ARGS}
   SOURCE_DIR ${CCCL_SOURCE_DIR}
   PREFIX ${CCCL_PREFIX_DIR}
   UPDATE_COMMAND ""
+  PATCH_COMMAND ${CCCL_PATCH_COMMAND}
   CONFIGURE_COMMAND ""
   BUILD_COMMAND ""
   INSTALL_COMMAND ""
diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
index c0454619b657ca..c1f635bfdf8aab 100644
--- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
@@ -67,53 +67,34 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
       input, num_input, len_hashtable, keys, key_index);
 
   // Get item index count.
-  auto item_count =
-      phi::memory_utils::Alloc(place, (num_input + 1) * sizeof(int));
-  int* item_count_ptr = reinterpret_cast<int*>(item_count->ptr());
-#ifdef PADDLE_WITH_HIP
-  hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1));
-#else
-  cudaMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1));
-#endif
+  thrust::device_vector<int> item_count(num_input + 1, 0);
   GetItemIndexCount<T><<<grid, block, 0, dev_ctx.stream()>>>(
-      input, item_count_ptr, num_input, len_hashtable, keys, key_index);
-
-  size_t temp_storage_bytes = 0;
-  cub::DeviceScan::ExclusiveSum(
-      NULL, temp_storage_bytes, item_count_ptr, item_count_ptr, num_input + 1);
-  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
-  cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
-                                temp_storage_bytes,
-                                item_count_ptr,
-                                item_count_ptr,
-                                num_input + 1);
-  int total_unique_items = 0;
-#ifdef PADDLE_WITH_HIP
-  hipMemcpy(&total_unique_items,
-            item_count_ptr + num_input,
-            sizeof(int),
-            hipMemcpyDeviceToHost);
-#else
-  cudaMemcpy(&total_unique_items,
-             item_count_ptr + num_input,
-             sizeof(int),
-             cudaMemcpyDeviceToHost);
-#endif
+      input,
+      thrust::raw_pointer_cast(item_count.data()),
+      num_input,
+      len_hashtable,
+      keys,
+      key_index);
 
+  thrust::exclusive_scan(
+      item_count.begin(), item_count.end(), item_count.begin());
+
+  int total_unique_items = item_count[num_input];
   auto unique_items =
       phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T));
   T* unique_items_data = reinterpret_cast<T*>(unique_items->ptr());
   *final_nodes_len = total_unique_items;
 
   // Get unique items
-  FillUniqueItems<T><<<grid, block, 0, dev_ctx.stream()>>>(input,
-                                                           num_input,
-                                                           len_hashtable,
-                                                           unique_items_data,
-                                                           item_count_ptr,
-                                                           keys,
-                                                           values,
-                                                           key_index);
+  FillUniqueItems<T><<<grid, block, 0, dev_ctx.stream()>>>(
+      input,
+      num_input,
+      len_hashtable,
+      unique_items_data,
+      thrust::raw_pointer_cast(item_count.data()),
+      keys,
+      values,
+      key_index);
   return unique_items;
 }
 
diff --git a/patches/cccl/util_device.cuh.patch b/patches/cccl/util_device.cuh.patch
new file mode 100644
index 00000000000000..0f0a4b0e4def37
--- /dev/null
+++ b/patches/cccl/util_device.cuh.patch
@@ -0,0 +1,22 @@
+diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
+index c7e15cafe..988cb9746 100644
+--- a/cub/cub/util_device.cuh
++++ b/cub/cub/util_device.cuh
+@@ -278,7 +278,7 @@ public:
+ /**
+  * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
+  */
+-CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
++CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version)
+ {
+     // Instantiate `EmptyKernel<void>` in both host and device code to ensure
+     // it can be called.
+@@ -593,7 +593,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
+  *
+  */
+ template <typename KernelPtr>
+-CUB_RUNTIME_FUNCTION inline
++CUB_RUNTIME_FUNCTION __forceinline__
+ cudaError_t MaxSmOccupancy(
+     int&                max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
+     KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy

From 18a24af9983848c492d02014db2f62b47d2615df Mon Sep 17 00:00:00 2001
From: "Tian Zheng (Engrg-Hardware 1)" <tizheng@nvidia.com>
Date: Mon, 26 Feb 2024 23:17:37 -0800
Subject: [PATCH 2/2] Fix test_weight_decay

---
 patches/cccl/util_device.cuh.patch | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/patches/cccl/util_device.cuh.patch b/patches/cccl/util_device.cuh.patch
index 0f0a4b0e4def37..bdf7165328d503 100644
--- a/patches/cccl/util_device.cuh.patch
+++ b/patches/cccl/util_device.cuh.patch
@@ -1,5 +1,5 @@
 diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
-index c7e15cafe..988cb9746 100644
+index c7e15cafe..756336914 100644
 --- a/cub/cub/util_device.cuh
 +++ b/cub/cub/util_device.cuh
 @@ -278,7 +278,7 @@ public:
@@ -11,6 +11,15 @@ index c7e15cafe..988cb9746 100644
  {
      // Instantiate `EmptyKernel<void>` in both host and device code to ensure
      // it can be called.
+@@ -375,7 +375,7 @@ __host__ inline cudaError_t PtxVersion(int& ptx_version, int device)
+  *
+  * \note This function is thread safe.
+  */
+-CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version)
++CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
+ {
+   cudaError_t result = cudaErrorUnknown;
+   NV_IF_TARGET(
 @@ -593,7 +593,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
   *
   */