diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 36189cc7e4c90d..73704b04cf90b2 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -217,6 +217,7 @@ class RecordedGpuMallocHelper {
     CUDADeviceGuard guard(dev_id_);
     gpuError_t result;
 #ifdef PADDLE_WITH_HIP
+    phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
     if (UNLIKELY(malloc_managed_memory)) {
       result = hipMallocManaged(ptr, size);
     } else {
diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
index ad30da4ddcd6f0..03da0544500920 100644
--- a/paddle/phi/core/visit_type.h
+++ b/paddle/phi/core/visit_type.h
@@ -355,7 +355,7 @@ namespace phi {
         "`");                                                       \
     }                                                               \
   }()
-#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_XPU)
 #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...)                         \
   [&] {                                                             \
     const auto& __dtype__ = TYPE;                                   \
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index abe752e27fd391..891888bf8b5850 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -209,11 +209,9 @@ if(WITH_ROCM)
       "gpu/lu_kernel.cu"
       "gpu/matrix_rank_kernel.cu"
       "gpu/matrix_rank_tol_kernel.cu"
-      "gpu/multiclass_nms3_kernel.cu"
       "gpu/put_along_axis_grad_kernel.cu"
       "gpu/put_along_axis_kernel.cu"
       "gpu/qr_kernel.cu"
-      "gpu/rms_norm_grad_kernel.cu"
       "gpu/svd_kernel.cu"
       "gpudnn/mha_cudnn_frontend.cu"
       "fusion/gpu/block_multi_head_attention_kernel.cu"
diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
index 463272a37c00d3..855b6fe6c8e15c 100644
--- a/paddle/phi/kernels/funcs/dropout_impl.cu.h
+++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -349,19 +349,6 @@ void DropoutFwGPUKernelDriver(
   } else {
     bool copy_in_kernel = GetSeedDataAndIncrement(
         dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);
-#ifdef PADDLE_WITH_HIP
-    VectorizedRandomGenerator
-        <<>>(0,
-                size,
-                seed_data,
-                dropout_prob,
-                x_data,
-                mask_data,
-                y_data,
-                upscale_in_train,
-                increment,
-                main_offset);
-#else
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
     auto gen_cuda = dev_ctx.GetGenerator();
     auto state_index = gen_cuda->GetStateIndex();
@@ -370,10 +357,11 @@ void DropoutFwGPUKernelDriver(
     parameterSetter = [offset, dev_ctx_p, state_index, is_fix_seed](
                           phi::backends::gpu::gpuKernelParams& params) {
       if (!is_fix_seed) {
-        // we assume seed is null pointer
-        // seed copy to cpu is meaningless here
+      // we assume seed is null pointer
+      // seed copy to cpu is meaningless here
+#ifndef PADDLE_WITH_HIP
         assert(seed_tensor_ptr == nullptr);
-
+#endif
         auto gen_cuda = dev_ctx_p->GetGenerator();
         // ensure the generator use correct state index
         gen_cuda->SetStateIndex(state_index);
@@ -393,9 +381,14 @@ void DropoutFwGPUKernelDriver(
     cudaKernelCallback = [=](unsigned int id) {
       void* functionPtr =
           reinterpret_cast<void*>(&(VectorizedRandomGenerator));
+#ifdef PADDLE_WITH_HIP
+      hipFunction_t cudaFunc =
+          reinterpret_cast<hipFunction_t>(functionPtr);
+#else
       cudaFunction_t cudaFunc;
       PADDLE_ENFORCE_GPU_SUCCESS(
           cudaGetFuncBySymbol(&cudaFunc, functionPtr));
+#endif
       VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc
                << " functionPtr = " << functionPtr;
@@ -417,7 +410,6 @@ void DropoutFwGPUKernelDriver(
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
-#endif
     }
   } else {
     if (upscale_in_train) {
diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
index 6a82875819161b..3eee52efcbebe6 100644
--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
+++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
@@ -166,14 +166,14 @@ __inline__ __device__ double rsqrt_(const double val) {
   return ::rsqrt(val);
 }
 
-#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) || defined(PADDLE_WITH_HIP)
 template <>
 __inline__ __device__ half rsqrt_(const half val) {
   return hrsqrt(val);
 }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template 1) {
       if (lane == 0) {
@@ -290,7 +294,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
 #pragma unroll
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
+#ifdef PADDLE_WITH_HIP
+      var_local += __shfl_xor(var_local, it);
+#else
       var_local += __shfl_xor_sync(uint32_t(-1), var_local, it);
+#endif
     }
 
     if (WARPS_N > 1) {
@@ -546,7 +554,7 @@ __inline__ __device__ void cuLoadAddStridedInputs(const int64_t i1_block,
   }
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template 0; it /= 2) {
+#ifdef PADDLE_WITH_HIP
+        sum_loss1 += __shfl_down(sum_loss1, it);
+        sum_loss2 += __shfl_down(sum_loss2, it);
+#else
         sum_loss1 += __shfl_down_sync(uint32_t(-1), sum_loss1, it);
         sum_loss2 += __shfl_down_sync(uint32_t(-1), sum_loss2, it);
+#endif
       }
       if (lane == 0) {
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
index 60a82cfe7c1980..48819c12a8dc0e 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
@@ -11,7 +11,12 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef PADDLE_WITH_HIP
+#ifdef PADDLE_WITH_HIP
+#include
+#include
+#include
+namespace cub = hipcub;
+#else
 #include
 #include
 #endif
@@ -21,9 +26,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
-#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
-#endif
 
 namespace phi {
 namespace fusion {
@@ -51,7 +54,6 @@ void FusedBiasDropoutResidualLnGradKernel(
     DenseTensor* bias_grad,
     DenseTensor* ln_scale_grad,
     DenseTensor* ln_bias_grad) {
-#ifndef PADDLE_WITH_HIP
   using U = LayerNormParamType<T>;
   auto* d_y_data = y_grad.data<T>();
   auto* ln_scale_data =
@@ -114,15 +116,19 @@
       d_x_data,
       d_bias_data,
       d_residual_data);
-#else
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "FusedBiasDropoutResidualLnGradKernel not surpport for rocm"));
-#endif
 }
 
 }  // namespace fusion
 }  // namespace phi
 
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FusedBiasDropoutResidualLnGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
 PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
                    GPU,
                    ALL_LAYOUT,
@@ -130,3 +136,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
                    float,
                    double,
                    phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
index 37450d3a4e178b..ca0bcbe7f2466a 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
@@ -17,9 +17,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
-#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
-#endif
 
 namespace phi {
 namespace fusion {
@@ -42,7 +40,6 @@ void FusedBiasDropoutResidualLnKernel(
     DenseTensor* dropout_mask_out,
     DenseTensor* ln_mean,
     DenseTensor* ln_variance) {
-#ifndef PADDLE_WITH_HIP
   using U = phi::funcs::LayerNormParamType<T>;
   auto* x_data = x.data<T>();
   auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
@@ -95,14 +92,20 @@
       y_data,
       ln_mean_data,
       ln_var_data);
-#else
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "FusedBiasDropoutResidualLnKernel not support for rocm"));
-#endif
 }
 
 }  // namespace fusion
 }  // namespace phi
 
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FusedBiasDropoutResidualLnKernel,
+                   float,
+                   phi::dtype::float16) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::UINT8);
+}
+#else
 PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm,
                    GPU,
                    ALL_LAYOUT,
@@ -112,3 +115,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm,
                    phi::dtype::float16) {
   kernel->OutputAt(1).SetDataType(phi::DataType::UINT8);
 }
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
index e5f5c9ba50ba45..d2cd2f1b545a7c 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
@@ -35,7 +35,11 @@ struct GeluFunctor {
 template <typename T>
 struct FastGeluFunctor {
   inline __device__ T operator()(const T x) const {
+#ifdef PADDLE_WITH_HIP
+    assert(0 && "ROCM does not support FastGelu");
+#else
     return phi::GeluFwd(x);
+#endif
   }
 };
@@ -92,8 +96,8 @@ __global__ void FusedDropoutActBias(
   int row_id = blockIdx.y;
   int idx = row_id * cols + col_id;
 
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
+  GPURAND(StatePhilox4_32_10_t) state;
+  GPURAND(_init)(seed, idx, increment, &state);
 
   const T factor =
       phi::fusion::GetFactor<T>(dropout_prob, is_upscale_in_train, is_test);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
index 801f070251fb2c..8994d521382335 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -202,18 +202,6 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
             ? NoMaskBwFunctor(1.0f - dropout_rate)
             : NoMaskBwFunctor(1.0f - dropout_rate, 1.0f);
-#ifdef PADDLE_WITH_HIP
-    VectorizedDropoutBackward>
-        <<>>(0,
-                numel,
-                seed_data,  // idx: 2 need save
-                x_grad_data,
-                y_grad_data,
-                out_grad_data,
-                increment,  // idx: 6 need save
-                main_offset,
-                functor);
-#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -233,9 +221,13 @@
       cudaKernelCallback = [=](unsigned int id) {
         void* functionPtr = reinterpret_cast<void*>(
             &(VectorizedDropoutBackward>));
+#ifdef PADDLE_WITH_HIP
+        hipFunction_t cudaFunc = reinterpret_cast<hipFunction_t>(functionPtr);
+#else
         cudaFunction_t cudaFunc;
         PADDLE_ENFORCE_GPU_SUCCESS(
             cudaGetFuncBySymbol(&cudaFunc, functionPtr));
+#endif
         VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc
                  << " functionPtr = " << functionPtr;
@@ -257,7 +249,6 @@
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
-#endif
   }
 }
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index c95c5fbf0ca3de..54ec3604bbee93 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -186,18 +186,6 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     auto dst_functor =
         NoMaskFwFunctor(1.0f - dropout_rate, upscale_in_train);
-#ifdef PADDLE_WITH_HIP
-    VectorizedDropoutForward>
-        <<>>(0,
-                numel,
-                seed_data,  // need save
-                x_data,
-                y_data,
-                out_data,
-                increment,  // need save
-                main_offset,
-                dst_functor);
-#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -237,9 +225,13 @@
      cudaKernelCallback = [=](unsigned int id) {
        void* functionPtr = reinterpret_cast<void*>(
            &(VectorizedDropoutForward>));
+#ifdef PADDLE_WITH_HIP
+        hipFunction_t cudaFunc = reinterpret_cast<hipFunction_t>(functionPtr);
+#else
        cudaFunction_t cudaFunc;
        PADDLE_ENFORCE_GPU_SUCCESS(
            cudaGetFuncBySymbol(&cudaFunc, functionPtr));
+#endif
        VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc
                 << " functionPtr = " << functionPtr;
@@ -260,7 +252,6 @@
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
-#endif
   } else {
     using MT = typename phi::dtype::MPTypeTrait<T>::Type;
     MT factor = static_cast<MT>(1.0f - dropout_rate);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
index 2ef46378b1b9bd..ef9ecbb435fdba 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
@@ -20,10 +20,25 @@ limitations under the License.
 */
 #include
 #endif
 
+#ifdef PADDLE_WITH_HIP
+#include
+#include
+#include
+#include
+#endif
+
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifdef PADDLE_WITH_HIP
+#define GPU(str) hip##str
+#define GPURAND(str) hiprand##str
+#else
+#define GPU(str) cuda##str
+#define GPURAND(str) curand##str
+#endif
+
 namespace phi {
 namespace fusion {
@@ -63,26 +78,29 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
 }
 
 template <int VecSize>
-__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec(GPURAND(StatePhilox4_32_10_t) * state,
                                         float *data);
 
 template <>
-__forceinline__ __device__ void RandVec<1>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<1>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
-  data[0] = curand_uniform(state);
+  data[0] = GPURAND(_uniform)(state);
 }
 
 template <>
-__forceinline__ __device__ void RandVec<2>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<2>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
-  data[0] = curand_uniform(state);
-  data[1] = curand_uniform(state);
+  data[0] = GPURAND(_uniform)(state);
+  data[1] = GPURAND(_uniform)(state);
 }
 
 template <>
-__forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<4>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
-  float4 rand4 = curand_uniform4(state);
+  float4 rand4 = GPURAND(_uniform4)(state);
   data[0] = rand4.x;
   data[1] = rand4.y;
   data[2] = rand4.w;
@@ -90,7 +108,8 @@ __forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state,
 }
 
 template <>
-__forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<8>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
   RandVec<4>(state, data);
   RandVec<4>(state, data + 4);
@@ -99,7 +118,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state,
 template <typename T>
 inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) {
   PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream()));
+      GPU(MemsetAsync)(ptr, 0, size * sizeof(T), ctx.stream()));
 }
 
 /**
diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
index e31b24e7e105e5..221019531a5486 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
@@ -38,10 +38,19 @@ limitations under the License.
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
-#ifndef PADDLE_WITH_HIP
-#include
 #include "paddle/phi/kernels/fusion/gpu/attention_layer.norm.h"
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#ifdef PADDLE_WITH_HIP
+#include
+#include
+#include
+namespace cub = hipcub;
+#define GPU(str) hip##str
+#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount
+#else
+#include
+#define GPU(str) cuda##str
+#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount
 #endif
 
 namespace phi {
@@ -50,9 +59,11 @@ namespace fusion {
 
 namespace {
 
-#ifndef PADDLE_WITH_HIP
-
+#ifdef PADDLE_WITH_HIP
+constexpr int kWarpSize = 64;
+#else
 constexpr int kWarpSize = 32;
+#endif
 
 template <typename T>
 struct SumOp {
@@ -74,7 +85,11 @@ template
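
Aside: the GPU()/GPURAND() macros introduced in fused_dropout_common.h work by
preprocessor token pasting, so a single call site such as GPU(MemsetAsync) or
GPURAND(_uniform) expands to cudaMemsetAsync/curand_uniform on CUDA builds and
to hipMemsetAsync/hiprand_uniform on ROCm builds. The following minimal,
self-contained sketch shows the same technique; it is illustrative only, not
part of the patch, and assumes the PADDLE_WITH_HIP flag is supplied by the
build system exactly as in the files above.

// portability_sketch.cu - build with nvcc, or with hipcc -DPADDLE_WITH_HIP
#include <cstdio>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#define GPU(str) hip##str  // GPU(Malloc) -> hipMalloc, GPU(Error_t) -> hipError_t
#else
#include <cuda_runtime.h>
#define GPU(str) cuda##str  // GPU(Malloc) -> cudaMalloc, GPU(Error_t) -> cudaError_t
#endif

int main() {
  void* ptr = nullptr;
  // Each call below is written once and resolves to the CUDA or HIP runtime
  // entry point at preprocessing time; this is what lets the patch delete the
  // duplicated #ifdef PADDLE_WITH_HIP kernel-launch branches.
  GPU(Error_t) err = GPU(Malloc)(&ptr, 256);
  if (err != GPU(Success)) return 1;
  err = GPU(Memset)(ptr, 0, 256);
  if (err != GPU(Success)) return 1;
  GPU(Free)(ptr);
  std::puts("ok");
  return 0;
}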