From a679c7396a6ca0cdc58d68877c422e65190314e0 Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Wed, 3 Apr 2024 14:57:47 +0000
Subject: [PATCH 1/9] [DCU] fix bugs and support some fused ops

---
 paddle/phi/CMakeLists.txt                     | 470 ++++++++++--------
 paddle/phi/core/visit_type.h                  |   2 +-
 paddle/phi/kernels/funcs/layer_norm_impl.cu.h |  24 +-
 ...dropout_residual_layer_norm_grad_kernel.cu |  23 +-
 ...bias_dropout_residual_layer_norm_kernel.cu |  18 +-
 .../fusion/gpu/fused_dropout_act_bias.h       |   8 +-
 .../kernels/fusion/gpu/fused_dropout_common.h |  39 +-
 .../fusion/gpu/fused_layernorm_kernel.cu      | 306 +++++++-----
 .../fused_layernorm_residual_dropout_bias.h   |  46 +-
 .../fusion/gpu/fused_residual_dropout_bias.h  |   6 +-
 paddle/phi/kernels/gpu/rms_norm_funcs.h       |  37 +-
 .../phi/kernels/gpu/rms_norm_grad_kernel.cu   |  24 +-
 paddle/phi/kernels/gpu/rms_norm_kernel.cu     | 235 +++++----
 test/legacy_test/test_fused_layernorm_op.py   |   6 +-
 test/legacy_test/test_rms_norm_op.py          |   6 +-
 15 files changed, 770 insertions(+), 480 deletions(-)

diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 7325aef2202b59..9e13c1c269222f 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -1,231 +1,311 @@
-configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h)
-
-# phi auto cmake utils
-include(phi)
-
-set(common_srcs CACHE INTERNAL "" FORCE)
-set(api_srcs CACHE INTERNAL "" FORCE)
-set(capi_srcs CACHE INTERNAL "" FORCE)
-set(core_srcs CACHE INTERNAL "" FORCE)
-set(backends_srcs CACHE INTERNAL "" FORCE)
-set(kernels_srcs CACHE INTERNAL "" FORCE)
-set(infermeta_srcs CACHE INTERNAL "" FORCE)
-#set(excluded_srcs CACHE INTERNAL "" FORCE)
-
-# paddle experimental common components
-add_subdirectory(common)
-# phi (low level) api headers: include
-# phi (high level) api
-add_subdirectory(api)
-# phi core components
-add_subdirectory(core)
-# phi components of specific backends
-add_subdirectory(backends)
-# phi kernels for diff device
-add_subdirectory(kernels)
-# phi infermeta
-add_subdirectory(infermeta)
-# phi tools
-add_subdirectory(tools)
-# phi capi
-if(WITH_CUSTOM_DEVICE)
-  add_subdirectory(capi)
-endif()
-
-set(PHI_DEPS
-    phi_profiler_proto
-    auto_parallel_proto
-    glog
-    warpctc
-    warprnnt
-    eigen3
-    xxhash
-    cblas
-    utf8proc
-    common)
-
-set(INFERENCE_DEPS phi_profiler_proto auto_parallel_proto)
-
-if(WITH_GPU)
-  list(APPEND PHI_DEPS external_error_proto)
-endif()
-
-if(WITH_ASCEND_CL)
-  list(APPEND PHI_DEPS npu_hccl)
-endif()
+set(kernel_declare_file
+    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp
+    CACHE INTERNAL "declarations.h file")
+set(kernel_declare_file_final
+    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h)
+file(
+  WRITE ${kernel_declare_file}
+  "// Generated by the paddle/phi/kernels/CMakeLists.txt.  DO NOT EDIT!\n\n#pragma once\n\n"
+)
+file(APPEND ${kernel_declare_file}
+     "#include \"paddle/phi/core/kernel_registry.h\"\n\n")
+set(kernel_declare_file_prune
+    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.prune
+    CACHE INTERNAL "declarations.h file")
 
-if(WITH_FLASHATTN)
-  list(APPEND PHI_DEPS flashattn)
-endif()
+# phi functors and functions called by kernels
+add_subdirectory(funcs)
 
-if(WITH_XBYAK)
-  list(APPEND PHI_DEPS xbyak)
-endif()
+# kernel autotune
+add_subdirectory(autotune)
 
-if(WITH_MKLDNN)
-  list(APPEND PHI_DEPS mkldnn)
-endif()
+copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
 
-if(WITH_GLOO)
-  list(APPEND PHI_DEPS gloo)
-endif()
+file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
+file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h")
+file(GLOB kernel_primitive_h "primitive/*.h")
 
-if(WITH_CUDNN_FRONTEND)
-  list(APPEND PHI_DEPS cudnn-frontend)
-endif()
+# fusion ops would be included here
+file(
+  GLOB kernel_cu
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "gpu/*.cu"
+  "gpu/*.cu.cc"
+  "gpudnn/*.cu"
+  "kps/*.cu"
+  "legacy/kps/*.cu"
+  "legacy/gpu/*.cu"
+  "selected_rows/gpu/*.cu"
+  "sparse/gpu/*.cu"
+  "strings/gpu/*.cu"
+  "fusion/gpu/*.cu")
 
-if(WITH_POCKETFFT)
-  list(APPEND PHI_DEPS pocketfft)
+if(APPLE OR WIN32)
+  list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
 endif()
 
-if(WITH_MKLML)
-  list(APPEND PHI_DEPS pocketfft dynload_mklml)
-  list(APPEND INFERENCE_DEPS dynload_mklml)
+if(NOT WITH_DGC)
+  list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu")
 endif()
 
-if(WITH_XPU)
-  list(APPEND PHI_DEPS xpulib)
-  if(WITH_XPU_PLUGIN)
-    add_subdirectory(kernels/xpu/plugin)
-    list(APPEND PHI_DEPS xpuplugin)
-  endif()
+if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
+  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
+  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
 endif()
 
-if(WITH_DGC)
-  list(APPEND PHI_DEPS dgc)
-endif()
+if(WITH_CUTLASS)
+  execute_process(
+    COMMAND
+      ${PYTHON_EXECUTABLE}
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
+      --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_tmp"
+    RESULT_VARIABLE memory_efficient_attention_gen_res)
 
-set(PHI_SRCS
-    ${common_srcs}
-    ${api_srcs}
-    ${core_srcs}
-    ${backends_srcs}
-    ${kernels_srcs}
-    ${infermeta_srcs}
-    ${capi_srcs})
-
-if(WITH_SHARED_PHI)
-  set(PHI_BUILD_TYPE
-      SHARED
-      CACHE INTERNAL "" FORCE)
-else()
-  set(PHI_BUILD_TYPE
-      STATIC
-      CACHE INTERNAL "" FORCE)
-endif()
+  execute_process(
+    COMMAND
+      ${PYTHON_EXECUTABLE}
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
+      --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_variable_tmp"
+    RESULT_VARIABLE memory_efficient_attention_variable_gen_res) # separate status: must not overwrite the first generator's result
 
-if(WITH_AVX
-   AND AVX512F_FOUND
-   AND AVX512F_FLAG
-   AND WITH_MKL)
-  set_source_files_properties(
-    kernels/fusion/cpu/self_dp_attention_kernel.cc
-    PROPERTIES COMPILE_FLAGS "-Wno-maybe-uninitialized  -mfma ${AVX512F_FLAG}")
-endif()
+  if(NOT memory_efficient_attention_gen_res EQUAL 0 OR NOT memory_efficient_attention_variable_gen_res EQUAL 0)
+    message(
+      FATAL_ERROR
+        "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
+    )
+  endif()
 
-if(WITH_GPU)
-  set_source_files_properties(
-    backends/gpu/gpu_resources.cc
-    PROPERTIES COMPILE_FLAGS
-               "-DCUDA_REAL_ARCHS=\"${NVCC_FLAGS_EXTRA_real_archs}\"")
-  nv_library(
-    phi ${PHI_BUILD_TYPE}
-    SRCS ${PHI_SRCS}
-    DEPS ${PHI_DEPS})
-
-elseif(WITH_ROCM)
-  hip_library(
-    phi ${PHI_BUILD_TYPE}
-    SRCS ${PHI_SRCS}
-    DEPS ${PHI_DEPS})
-
-elseif(WITH_XPU_KP)
-  xpu_library(
-    phi ${PHI_BUILD_TYPE}
-    SRCS ${PHI_SRCS}
-    DEPS ${PHI_DEPS})
-else()
-  cc_library(
-    phi ${PHI_BUILD_TYPE}
-    SRCS ${PHI_SRCS}
-    DEPS ${PHI_DEPS})
-endif()
+  set(autogen_tmp_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp
+  )
+  set(autogen_variable_tmp_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp
+  )
+  set(autogen_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen
+  )
+  set(autogen_variable_dir
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable
+  )
 
-target_compile_definitions(phi PUBLIC PHI_INNER)
+  file(GLOB generated_files ${autogen_tmp_dir}/*.h ${autogen_tmp_dir}/impl/*.cu)
 
-if(WIN32)
-  target_link_libraries(phi shlwapi.lib)
-endif()
+  file(GLOB variable_generated_files ${autogen_variable_tmp_dir}/*.h
+       ${autogen_variable_tmp_dir}/impl/*.cu)
 
-if(WIN32)
-  if(WITH_SHARED_PHI)
-    set_property(TARGET phi PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    set(PHI_NAME
-        phi.dll
-        CACHE INTERNAL "" FORCE)
+  if(EXISTS ${autogen_dir})
+    foreach(gen_file ${generated_files})
+      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                              "${gen_file}" "${now_file}")
+    endforeach()
+    message("copy if different ${autogen_dir}")
   else()
-    set(PHI_NAME
-        phi.lib
-        CACHE INTERNAL "" FORCE)
+    foreach(gen_file ${generated_files})
+      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
+                              "${now_file}")
+    endforeach()
+    message("copy ${autogen_dir}")
   endif()
-elseif(APPLE)
-  if(WITH_SHARED_PHI)
-    set(PHI_NAME
-        libphi.dylib
-        CACHE INTERNAL "" FORCE)
+
+  if(EXISTS ${autogen_variable_dir})
+    foreach(gen_file ${variable_generated_files})
+      string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file
+                     ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                              "${gen_file}" "${now_file}")
+    endforeach()
+    message("copy if different ${autogen_variable_dir}")
   else()
-    set(PHI_NAME
-        libphi.a
-        CACHE INTERNAL "" FORCE)
+    foreach(gen_file ${variable_generated_files})
+      string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file
+                     ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
+                              "${now_file}")
+    endforeach()
+    message("copy ${autogen_variable_dir}")
   endif()
-else()
-  if(WITH_SHARED_PHI)
-    set(PHI_NAME
-        libphi.so
-        CACHE INTERNAL "" FORCE)
+
+  file(
+    REMOVE_RECURSE
+    ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp
+  )
+  file(
+    REMOVE_RECURSE
+    ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp
+  )
+
+  execute_process(
+    COMMAND
+      ${CMAKE_COMMAND} -E make_directory
+      "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp"
+    COMMAND ${PYTHON_EXECUTABLE} generic_mixed_gemm_kernelLauncher.py
+            --cuda_arch "${NVCC_ARCH_BIN}"
+    WORKING_DIRECTORY
+      "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm"
+  )
+  set(fpA_intB_gemm_autogen_tmp_dir
+      ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp
+  )
+  set(fpA_intB_gemm_autogen_dir
+      ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen
+  )
+
+  file(GLOB fpA_intB_gemm_autogen_files ${fpA_intB_gemm_autogen_tmp_dir}/*.h
+       ${fpA_intB_gemm_autogen_tmp_dir}/*.cu)
+
+  if(EXISTS ${fpA_intB_gemm_autogen_dir})
+    foreach(gen_file ${fpA_intB_gemm_autogen_files})
+      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                              "${gen_file}" "${now_file}")
+    endforeach()
+    message("copy if different ${fpA_intB_gemm_autogen_dir}")
   else()
-    set(PHI_NAME
-        libphi.a
-        CACHE INTERNAL "" FORCE)
+    foreach(gen_file ${fpA_intB_gemm_autogen_files})
+      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
+      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
+                              "${now_file}")
+    endforeach()
+    message("copy ${fpA_intB_gemm_autogen_dir}")
   endif()
+
+  file(
+    GLOB cutlass_cu
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    "fusion/cutlass/*.cu"
+    "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
+    "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
+    "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu"
+    "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu")
+
+  list(APPEND kernel_cu ${cutlass_cu})
 endif()
 
-set(PHI_LIB
-    "${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
-    CACHE FILEPATH "PHI Library" FORCE)
+if(NOT WITH_CUDNN_FRONTEND)
+  list(
+    REMOVE_ITEM
+    kernel_cu
+    "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu"
+    "fusion/gpu/fused_scale_bias_add_relu_kernel.cu"
+    "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu"
+    "fusion/gpu/fused_dot_product_attention_op.cu"
+    "fusion/gpu/max_pool2d_v2_grad_kernel.cu"
+    "fusion/gpu/max_pool2d_v2_kernel.cu")
+endif()
 
-if(MKL_FOUND AND WITH_ONEMKL)
-  target_include_directories(phi PRIVATE ${MKL_INCLUDE})
+# Note(qili93): remove kernels not supported on DCU yet
+if(WITH_ROCM)
+  list(
+    REMOVE_ITEM
+    kernel_cu
+    "gpu/affine_grid_grad_kernel.cu"
+    "gpu/apply_per_channel_scale_kernel.cu"
+    "gpu/cholesky_solve_kernel.cu"
+    "gpu/eigh_kernel.cu"
+    "gpu/eigvalsh_kernel.cu"
+    "gpu/lstsq_kernel.cu"
+    "gpu/lu_kernel.cu"
+    "gpu/matrix_rank_kernel.cu"
+    "gpu/matrix_rank_tol_kernel.cu"
+    "gpu/put_along_axis_grad_kernel.cu"
+    "gpu/put_along_axis_kernel.cu"
+    "gpu/qr_kernel.cu"
+    "gpu/svd_kernel.cu"
+    "gpudnn/mha_cudnn_frontend.cu"
+    "fusion/gpu/block_multi_head_attention_kernel.cu"
+    "fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
+    "fusion/gpu/fused_bn_add_activation_kernel.cu"
+    "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
 endif()
 
-add_dependencies(phi extern_lapack)
-if(WITH_CUTLASS)
-  add_dependencies(phi cutlass_codegen)
-  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION"
-  )# for memory_efficient_attention.h
+set(cc_search_pattern
+    "*.cc"
+    "cpu/*.cc"
+    "legacy/*.cc"
+    "legacy/cpu/*.cc"
+    "selected_rows/*.cc"
+    "selected_rows/cpu/*.cc"
+    "sparse/*.cc"
+    "sparse/cpu/*.cc"
+    "legacy/*.cc"
+    "legacy/cpu/*.cc"
+    "strings/*.cc"
+    "strings/cpu/*.cc"
+    "fusion/*.cc"
+    "stride/*.cc"
+    "fusion/cpu/*.cc")
+
+if(WITH_MKLDNN)
+  set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc"
+                        "fusion/onednn/*.cc")
+endif()
+
+if(WITH_CUSTOM_DEVICE)
+  set(cc_search_pattern ${cc_search_pattern} "custom/*.cc")
 endif()
-if(WITH_FLASHATTN)
-  add_dependencies(phi flashattn)
+
+file(
+  GLOB kernel_cc
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  ${cc_search_pattern})
+
+if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
+  list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$")
 endif()
 
-# for inference static library
-if(NOT WITH_SHARED_PHI)
-  get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
-  set(phi_modules ${phi_modules} ${INFERENCE_DEPS} phi)
-  set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
+if(NOT
+   (WITH_AVX
+    AND AVX512F_FOUND
+    AND AVX512F_FLAG
+    AND WITH_MKL))
+  list(REMOVE_ITEM kernel_cc "fusion/cpu/self_dp_attention_kernel.cc")
 endif()
 
-set(phi_extension_header_file
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
-    CACHE INTERNAL "phi/extension.h file")
 file(
-  WRITE ${phi_extension_header_file}
-  "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n"
-)
+  GLOB kernel_xpu
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc"
+  "sparse/xpu/*.cc")
+
+if(WITH_GPU OR WITH_ROCM)
+  collect_srcs(kernels_srcs SRCS ${kernel_cu})
+  kernel_declare("${kernel_cu}")
+endif()
+
+if(WITH_XPU)
+  if(WITH_XPU_KP)
+    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
+         DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
+    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/
+         DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
+    file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.cu")
+    foreach(kernel ${kernel_xpu_kps})
+      get_filename_component(name ${kernel} NAME_WE)
+      file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
+    endforeach()
+    file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
+    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_kps})
 
-file(APPEND ${phi_extension_header_file} "#include \"paddle/phi/config.h\"\n\n")
-# generate inner headers include dir for users
-generate_unify_header(backends EXCLUDES context_pool_utils.h)
-generate_unify_header(core EXCLUDES cuda_stream.h)
-generate_unify_header(infermeta)
-generate_unify_header(kernels SKIP_SUFFIX grad_kernel)
+    foreach(kernel ${kernel_cc})
+      configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${kernel}
+                     ${CMAKE_CURRENT_BINARY_DIR}/${kernel} COPYONLY)
+    endforeach()
+    file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc")
+    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_cc})
+    set(kernel_cc "")
+
+  endif()
+  collect_srcs(kernels_srcs SRCS ${kernel_xpu})
+  kernel_declare("${kernel_xpu}")
+  kernel_declare("${kernel_xpu_kps}")
+  kernel_declare("${kernel_xpu_cc}")
+endif()
+
+collect_srcs(kernels_srcs SRCS ${kernel_cc})
+kernel_declare("${kernel_cc}")
+
+if(NOT "${KERNEL_LIST}" STREQUAL "")
+  prune_declaration_h()
+endif()
diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
index ad30da4ddcd6f0..03da0544500920 100644
--- a/paddle/phi/core/visit_type.h
+++ b/paddle/phi/core/visit_type.h
@@ -355,7 +355,7 @@ namespace phi {
                  "`");                                                        \
     }                                                                         \
   }()
-#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_XPU)
 #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...)                                    \
   [&] {                                                                        \
     const auto& __dtype__ = TYPE;                                              \
diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
index 6a82875819161b..3eee52efcbebe6 100644
--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
+++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
@@ -166,14 +166,14 @@ __inline__ __device__ double rsqrt_(const double val) {
   return ::rsqrt(val);
 }
 
-#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) || defined(PADDLE_WITH_HIP)
 template <>
 __inline__ __device__ half rsqrt_(const half val) {
   return hrsqrt(val);
 }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T,
           typename U,
           typename ScaleT = U,
@@ -254,7 +254,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
 
 #pragma unroll
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
+#ifdef PADDLE_WITH_HIP
+      mu_local += __shfl_xor(mu_local, it);
+#else
       mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it);
+#endif
     }
     if (WARPS_N > 1) {
       if (lane == 0) {
@@ -290,7 +294,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
 
 #pragma unroll
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
+#ifdef PADDLE_WITH_HIP
+      var_local += __shfl_xor(var_local, it);
+#else
       var_local += __shfl_xor_sync(uint32_t(-1), var_local, it);
+#endif
     }
 
     if (WARPS_N > 1) {
@@ -546,7 +554,7 @@ __inline__ __device__ void cuLoadAddStridedInputs(const int64_t i1_block,
   }
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <bool IsFusedDropoutResidualLn,
           bool NeedDDropoutSrcPtr,
           typename T,
@@ -678,16 +686,26 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_fast_kernel(
 #pragma unroll
       // row reduction among 32 threads.
       for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
+#ifdef PADDLE_WITH_HIP
+        sum_loss1 += __shfl_xor(sum_loss1, it);
+        sum_loss2 += __shfl_xor(sum_loss2, it);
+#else
         sum_loss1 += __shfl_xor_sync(uint32_t(-1), sum_loss1, it);
         sum_loss2 += __shfl_xor_sync(uint32_t(-1), sum_loss2, it);
+#endif
       }
       sum_loss1 *= rn;
       sum_loss2 *= rn;
     } else {
 #pragma unroll
       for (int it = 16; it > 0; it /= 2) {
+#ifdef PADDLE_WITH_HIP
+        sum_loss1 += __shfl_down(sum_loss1, it);
+        sum_loss2 += __shfl_down(sum_loss2, it);
+#else
         sum_loss1 += __shfl_down_sync(uint32_t(-1), sum_loss1, it);
         sum_loss2 += __shfl_down_sync(uint32_t(-1), sum_loss2, it);
+#endif
       }
 
       if (lane == 0) {
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
index 60a82cfe7c1980..48819c12a8dc0e 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
@@ -11,7 +11,12 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef PADDLE_WITH_HIP
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
 #include <cuda_fp16.h>
 #include <cub/cub.cuh>
 #endif
@@ -21,9 +26,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
-#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
-#endif
 
 namespace phi {
 namespace fusion {
@@ -51,7 +54,6 @@ void FusedBiasDropoutResidualLnGradKernel(
     DenseTensor* bias_grad,
     DenseTensor* ln_scale_grad,
     DenseTensor* ln_bias_grad) {
-#ifndef PADDLE_WITH_HIP
   using U = LayerNormParamType<T>;
   auto* d_y_data = y_grad.data<T>();
   auto* ln_scale_data =
@@ -114,15 +116,19 @@ void FusedBiasDropoutResidualLnGradKernel(
       d_x_data,
       d_bias_data,
       d_residual_data);
-#else
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "FusedBiasDropoutResidualLnGradKernel not surpport for rocm"));
-#endif
 }
 
 }  // namespace fusion
 }  // namespace phi
 
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FusedBiasDropoutResidualLnGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#else
 PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
                    GPU,
                    ALL_LAYOUT,
@@ -130,3 +136,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad,
                    float,
                    double,
                    phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
index 37450d3a4e178b..ca0bcbe7f2466a 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
@@ -17,9 +17,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
-#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
-#endif
 
 namespace phi {
 namespace fusion {
@@ -42,7 +40,6 @@ void FusedBiasDropoutResidualLnKernel(
     DenseTensor* dropout_mask_out,
     DenseTensor* ln_mean,
     DenseTensor* ln_variance) {
-#ifndef PADDLE_WITH_HIP
   using U = phi::funcs::LayerNormParamType<T>;
   auto* x_data = x.data<T>();
   auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
@@ -95,14 +92,20 @@ void FusedBiasDropoutResidualLnKernel(
       y_data,
       ln_mean_data,
       ln_var_data);
-#else
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "FusedBiasDropoutResidualLnKernel not support for rocm"));
-#endif
 }
 }  // namespace fusion
 }  // namespace phi
 
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FusedBiasDropoutResidualLnKernel,
+                   float,
+                   phi::dtype::float16) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::UINT8);
+}
+#else
 PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm,
                    GPU,
                    ALL_LAYOUT,
@@ -112,3 +115,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm,
                    phi::dtype::float16) {
   kernel->OutputAt(1).SetDataType(phi::DataType::UINT8);
 }
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
index e5f5c9ba50ba45..1db2d0134f80a9 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
@@ -35,7 +35,11 @@ struct GeluFunctor {
 template <typename T>
 struct FastGeluFunctor {
   inline __device__ T operator()(const T x) const {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE(0, "FastGelu is not supported on ROCm");
+#else
     return phi::GeluFwd<T, true>(x);
+#endif  // PADDLE_WITH_HIP
   }
 };
 
@@ -92,8 +96,8 @@ __global__ void FusedDropoutActBias(
   int row_id = blockIdx.y;
   int idx = row_id * cols + col_id;
 
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
+  GPURAND(StatePhilox4_32_10_t) state;
+  GPURAND(_init)(seed, idx, increment, &state);
 
   const T factor =
       phi::fusion::GetFactor<T>(dropout_prob, is_upscale_in_train, is_test);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
index 2ef46378b1b9bd..ef9ecbb435fdba 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
@@ -20,10 +20,25 @@ limitations under the License. */
 #include <curand_kernel.h>
 #endif
 
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hiprand.h>
+#include <hiprand_kernel.h>
+#endif
+
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
 
+#ifdef PADDLE_WITH_HIP
+#define GPU(str) hip##str
+#define GPURAND(str) hiprand##str
+#else
+#define GPU(str) cuda##str
+#define GPURAND(str) curand##str
+#endif
+
 namespace phi {
 namespace fusion {
 
@@ -63,26 +78,29 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
 }
 
 template <int VecSize>
-__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec(GPURAND(StatePhilox4_32_10_t) * state,
                                         float *data);
 
 template <>
-__forceinline__ __device__ void RandVec<1>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<1>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
-  data[0] = curand_uniform(state);
+  data[0] = GPURAND(_uniform)(state);
 }
 
 template <>
-__forceinline__ __device__ void RandVec<2>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<2>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
-  data[0] = curand_uniform(state);
-  data[1] = curand_uniform(state);
+  data[0] = GPURAND(_uniform)(state);
+  data[1] = GPURAND(_uniform)(state);
 }
 
 template <>
-__forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<4>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
-  float4 rand4 = curand_uniform4(state);
+  float4 rand4 = GPURAND(_uniform4)(state);
   data[0] = rand4.x;
   data[1] = rand4.y;
   data[2] = rand4.w;
@@ -90,7 +108,8 @@ __forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state,
 }
 
 template <>
-__forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state,
+__forceinline__ __device__ void RandVec<8>(GPURAND(StatePhilox4_32_10_t) *
+                                               state,
                                            float *data) {
   RandVec<4>(state, data);
   RandVec<4>(state, data + 4);
@@ -99,7 +118,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state,
 template <typename T>
 inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) {
   PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream()));
+      GPU(MemsetAsync)(ptr, 0, size * sizeof(T), ctx.stream()));
 }
 
 /**
diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
index e31b24e7e105e5..221019531a5486 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
@@ -38,10 +38,19 @@ limitations under the License.
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
-#ifndef PADDLE_WITH_HIP
-#include <cub/cub.cuh>
 #include "paddle/phi/kernels/fusion/gpu/attention_layer.norm.h"
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#define GPU(str) hip##str
+#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount
+#else
+#include <cub/cub.cuh>
+#define GPU(str) cuda##str
+#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount
 #endif
 
 namespace phi {
@@ -50,9 +59,11 @@ namespace fusion {
 
 namespace {
 
-#ifndef PADDLE_WITH_HIP
-
+#ifdef PADDLE_WITH_HIP
+constexpr int kWarpSize = 64;
+#else
 constexpr int kWarpSize = 32;
+#endif
 
 template <typename T>
 struct SumOp {
@@ -74,7 +85,11 @@ template <template <typename> class ReductionOp,
 __inline__ __device__ T WarpAllReduce(T val) {
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
     val = ReductionOp<T>()(
+#ifdef PADDLE_WITH_HIP
+        val, __shfl_xor(val, mask, thread_group_width));
+#else
         val, __shfl_xor_sync(0xffffffff, val, mask, thread_group_width));
+#endif
   }
   return val;
 }
@@ -97,7 +112,7 @@ __inline__ __device__ T Div(T a, T b);
 
 template <>
 __inline__ __device__ float Div<float>(float a, float b) {
-#ifdef OF_LAYER_NORM_USE_FAST_MATH
+#if defined(OF_LAYER_NORM_USE_FAST_MATH) || defined(PADDLE_WITH_HIP)
   return __fdividef(a, b);
 #else
   return a / b;
@@ -114,7 +129,7 @@ __inline__ __device__ T Rsqrt(T x);
 
 template <>
 __inline__ __device__ float Rsqrt<float>(float x) {
-#ifdef OF_LAYER_NORM_USE_FAST_MATH
+#if defined(OF_LAYER_NORM_USE_FAST_MATH) || defined(PADDLE_WITH_HIP)
   return __frsqrt_rn(x);
 #else
   return rsqrt(x);
@@ -127,35 +142,44 @@ __inline__ __device__ double Rsqrt<double>(double x) {
 }
 
+// Chooses a grid size for `func`: clamp(sm_count * max_active_blocks * waves)
+// to the range [1, max_blocks]. Returns the first runtime error reported by a
+// device query; `*num_blocks` is written only when every query succeeded.
 template <class Func>
-inline cudaError_t GetNumBlocks(Func func,
-                                int64_t block_size,
-                                size_t dynamic_smem_size,
-                                int64_t max_blocks,
-                                int64_t waves,
-                                int* num_blocks) {
+inline GPU(Error_t) GetNumBlocks(Func func,
+                                 int64_t block_size,
+                                 size_t dynamic_smem_size,
+                                 int64_t max_blocks,
+                                 int64_t waves,
+                                 int* num_blocks) {
   int dev;
   {
-    cudaError_t err = cudaGetDevice(&dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t) err = GPU(GetDevice)(&dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
   int sm_count;
   {
-    cudaError_t err =
-        cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t)
+    err = GPU(DeviceGetAttribute)(&sm_count, GPUMultiProcessorCount, dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
   int max_active_blocks;
   {
-    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    GPU(Error_t)
+    err = GPU(OccupancyMaxActiveBlocksPerMultiprocessor)(
         &max_active_blocks, func, block_size, dynamic_smem_size);
+    // Without this check a failed query leaves max_active_blocks
+    // uninitialized and the grid size computed below would be garbage.
+    if (err != GPU(Success)) {
+      return err;
+    }
   }
   *num_blocks = std::max<int>(
       1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
-  return cudaSuccess;
+  return GPU(Success);
 }
 
 template <typename T>
@@ -279,9 +295,15 @@ __inline__ __device__ void WelfordWarpReduce(
   *m2 = thread_m2;
   *count = thread_count;
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+#ifdef PADDLE_WITH_HIP
+    T b_mean = __shfl_down(*mean, mask, thread_group_width);
+    T b_m2 = __shfl_down(*m2, mask, thread_group_width);
+    T b_count = __shfl_down(*count, mask, thread_group_width);
+#else
     T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width);
     T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width);
     T b_count = __shfl_down_sync(0xffffffff, *count, mask, thread_group_width);
+#endif
     WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
   }
 }
@@ -291,9 +313,15 @@ __inline__ __device__ void WelfordWarpAllReduce(
     T thread_mean, T thread_m2, T thread_count, T* mean, T* m2, T* count) {
   WelfordWarpReduce<T, thread_group_width>(
       thread_mean, thread_m2, thread_count, mean, m2, count);
+#ifdef PADDLE_WITH_HIP
+  *mean = __shfl(*mean, 0, thread_group_width);
+  *m2 = __shfl(*m2, 0, thread_group_width);
+  *count = __shfl(*count, 0, thread_group_width);
+#else
   *mean = __shfl_sync(0xffffffff, *mean, 0, thread_group_width);
   *m2 = __shfl_sync(0xffffffff, *m2, 0, thread_group_width);
   *count = __shfl_sync(0xffffffff, *count, 0, thread_group_width);
+#endif
 }
 
 template <typename T, int thread_group_width = kWarpSize>
@@ -301,7 +329,11 @@ __inline__ __device__ T WarpReduceSum(T x) {
   T result = 0.0f;
 #pragma unroll
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+#ifdef PADDLE_WITH_HIP
+    result += __shfl_xor(x, mask, thread_group_width);
+#else
     result += __shfl_xor_sync(0xffffffff, x, mask, thread_group_width);
+#endif
   }
   return result;
 }
@@ -343,7 +375,13 @@ __inline__ __device__ void WelfordBlockAllReduce(T thread_mean,
       warp_m2 = static_cast<T>(0);
       warp_count = static_cast<T>(0);
     }
+    // NOTE(review): this point appears to be reached by a single warp only
+    // (presumably under `if (wid == 0)` -- confirm against the full file), so
+    // a block-wide __syncthreads() must not stand in for __syncwarp() here.
+    // On HIP the warp-level barrier is simply compiled out.
+#ifndef PADDLE_WITH_HIP
     __syncwarp();
+#endif
     T block_mean = 0;
     T block_m2 = 0;
     T block_count = 0;
@@ -429,63 +465,75 @@ template <typename LOAD,
           typename ComputeType,
           int pack_size,
           int block_size>
-inline cudaError_t LaunchLayerNormBlockSMemImpl(cudaStream_t stream,
-                                                LOAD load,
-                                                STORE store,
-                                                int smem,
-                                                const int64_t rows,
-                                                const int64_t cols,
-                                                const double epsilon,
-                                                ComputeType* mean,
-                                                ComputeType* inv_variance,
-                                                ComputeType col_divisor) {
+inline GPU(Error_t) LaunchLayerNormBlockSMemImpl(GPU(Stream_t) stream,
+                                                 LOAD load,
+                                                 STORE store,
+                                                 int smem,
+                                                 const int64_t rows,
+                                                 const int64_t cols,
+                                                 const double epsilon,
+                                                 ComputeType* mean,
+                                                 ComputeType* inv_variance,
+                                                 ComputeType col_divisor) {
   constexpr int waves = 32;
   int grid_dim_x;
   {
-    cudaError_t err = GetNumBlocks(
+    GPU(Error_t)
+    err = GetNumBlocks(
         LayerNormBlockSMemImpl<LOAD, STORE, ComputeType, pack_size, block_size>,
         block_size,
         smem,
         rows,
         waves,
         &grid_dim_x);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
   LayerNormBlockSMemImpl<LOAD, STORE, ComputeType, pack_size, block_size>
       <<<grid_dim_x, block_size, smem, stream>>>(
           load, store, rows, cols, epsilon, mean, inv_variance, col_divisor);
-  return cudaPeekAtLastError();
+  return GPU(PeekAtLastError)();
 }
 
 template <typename Func>
-cudaError_t MaximizeDynamicSharedMemorySize(Func func,
-                                            const int max_smem_size) {
-  cudaFuncAttributes attr{};
+GPU(Error_t)
+MaximizeDynamicSharedMemorySize(Func func, const int max_smem_size) {
+  GPU(FuncAttributes) attr{};
+#ifdef PADDLE_WITH_HIP
+  hipError_t err = hipFuncGetAttributes(&attr, (const void*)func);
+#else
   cudaError_t err = cudaFuncGetAttributes(&attr, func);
-  if (err != cudaSuccess) {
+#endif
+  if (err != GPU(Success)) {
     return err;
   }
   constexpr int reserved_smem = 1024;  // 1K
+#ifdef PADDLE_WITH_HIP
+  return hipFuncSetAttribute(
+      (const void*)func,
+      hipFuncAttributeMaxDynamicSharedMemorySize,
+      max_smem_size - attr.sharedSizeBytes - reserved_smem);
+#else
   return cudaFuncSetAttribute(
       func,
       cudaFuncAttributeMaxDynamicSharedMemorySize,
       max_smem_size - attr.sharedSizeBytes - reserved_smem);
+#endif
 }
 
 template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
-inline cudaError_t TryDispatchLayerNormBlockSMemImplBlockSize(
-    cudaStream_t stream,
-    LOAD load,
-    STORE store,
-    const int64_t rows,
-    const int64_t cols,
-    const double epsilon,
-    ComputeType* mean,
-    ComputeType* inv_variance,
-    ComputeType col_divisor,
-    bool* success) {
+inline GPU(Error_t)
+    TryDispatchLayerNormBlockSMemImplBlockSize(GPU(Stream_t) stream,
+                                               LOAD load,
+                                               STORE store,
+                                               const int64_t rows,
+                                               const int64_t cols,
+                                               const double epsilon,
+                                               ComputeType* mean,
+                                               ComputeType* inv_variance,
+                                               ComputeType col_divisor,
+                                               bool* success) {
   // Note(Zhengzekang): We choose a fixed blocksize to avoid layernorm diff, by
   // RichardWooSJTU.
 
@@ -493,8 +541,8 @@ inline cudaError_t TryDispatchLayerNormBlockSMemImplBlockSize(
 
   int dev = 0;
   {
-    cudaError_t err = cudaGetDevice(&dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t) err = GPU(GetDevice)(&dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
@@ -520,16 +568,17 @@ inline cudaError_t TryDispatchLayerNormBlockSMemImplBlockSize(
 
 template <typename LOAD, typename STORE, typename ComputeType>
 struct TryDispatchLayerNormBlockSMemImplPackSize {
-  cudaError_t operator()(cudaStream_t stream,
-                         LOAD load,
-                         STORE store,
-                         const int64_t rows,
-                         const int64_t cols,
-                         const double epsilon,
-                         ComputeType* mean,
-                         ComputeType* inv_variance,
-                         ComputeType col_divisor,
-                         bool* success) {
+  GPU(Error_t)
+  operator()(GPU(Stream_t) stream,
+             LOAD load,
+             STORE store,
+             const int64_t rows,
+             const int64_t cols,
+             const double epsilon,
+             ComputeType* mean,
+             ComputeType* inv_variance,
+             ComputeType col_divisor,
+             bool* success) {
     if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
         CanPackAs<STORE>(store, 4)) {
       return TryDispatchLayerNormBlockSMemImplBlockSize<LOAD,
@@ -579,16 +628,16 @@ struct TryDispatchLayerNormBlockSMemImplPackSize {
 };
 
 template <typename LOAD, typename STORE, typename ComputeType>
-inline cudaError_t TryDispatchLayerNormBlockSMemImpl(cudaStream_t stream,
-                                                     LOAD load,
-                                                     STORE store,
-                                                     const int64_t rows,
-                                                     const int64_t cols,
-                                                     const double epsilon,
-                                                     ComputeType* mean,
-                                                     ComputeType* inv_variance,
-                                                     ComputeType col_divisor,
-                                                     bool* success) {
+inline GPU(Error_t) TryDispatchLayerNormBlockSMemImpl(GPU(Stream_t) stream,
+                                                      LOAD load,
+                                                      STORE store,
+                                                      const int64_t rows,
+                                                      const int64_t cols,
+                                                      const double epsilon,
+                                                      ComputeType* mean,
+                                                      ComputeType* inv_variance,
+                                                      ComputeType col_divisor,
+                                                      bool* success) {
   return TryDispatchLayerNormBlockSMemImplPackSize<LOAD, STORE, ComputeType>()(
       stream,
       load,
@@ -663,48 +712,51 @@ __global__ void __launch_bounds__(1024)
 }
 
 template <typename LOAD, typename STORE, typename ComputeType, int pack_size>
-inline cudaError_t LaunchLayerNormBlockUncachedImpl(cudaStream_t stream,
-                                                    LOAD load,
-                                                    STORE store,
-                                                    const int64_t rows,
-                                                    const int64_t cols,
-                                                    const double epsilon,
-                                                    ComputeType* mean,
-                                                    ComputeType* inv_variance) {
+inline GPU(Error_t)
+    LaunchLayerNormBlockUncachedImpl(GPU(Stream_t) stream,
+                                     LOAD load,
+                                     STORE store,
+                                     const int64_t rows,
+                                     const int64_t cols,
+                                     const double epsilon,
+                                     ComputeType* mean,
+                                     ComputeType* inv_variance) {
   constexpr int block_size = 1024;
   constexpr int waves = 32;
   int grid_dim_x;
   {
-    cudaError_t err = GetNumBlocks(LayerNormBlockUncachedImpl<LOAD,
-                                                              STORE,
-                                                              ComputeType,
-                                                              pack_size,
-                                                              block_size>,
-                                   block_size,
-                                   0,
-                                   rows,
-                                   waves,
-                                   &grid_dim_x);
-    if (err != cudaSuccess) {
+    GPU(Error_t)
+    err = GetNumBlocks(LayerNormBlockUncachedImpl<LOAD,
+                                                  STORE,
+                                                  ComputeType,
+                                                  pack_size,
+                                                  block_size>,
+                       block_size,
+                       0,
+                       rows,
+                       waves,
+                       &grid_dim_x);
+    if (err != GPU(Success)) {
       return err;
     }
   }
   LayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, pack_size, block_size>
       <<<grid_dim_x, block_size, 0, stream>>>(
           load, store, rows, cols, epsilon, mean, inv_variance);
-  return cudaPeekAtLastError();
+  return GPU(PeekAtLastError)();
 }
 
 template <typename LOAD, typename STORE, typename ComputeType>
 struct DispatchLayerNormBlockUncachedImplPackSize {
-  cudaError_t operator()(cudaStream_t stream,
-                         LOAD load,
-                         STORE store,
-                         const int64_t rows,
-                         const int64_t cols,
-                         const double epsilon,
-                         ComputeType* mean,
-                         ComputeType* inv_variance) {
+  GPU(Error_t)
+  operator()(GPU(Stream_t) stream,
+             LOAD load,
+             STORE store,
+             const int64_t rows,
+             const int64_t cols,
+             const double epsilon,
+             ComputeType* mean,
+             ComputeType* inv_variance) {
     if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
         CanPackAs<STORE>(store, 4)) {
       return LaunchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType, 4>(
@@ -721,23 +773,23 @@ struct DispatchLayerNormBlockUncachedImplPackSize {
 };
 
 template <typename LOAD, typename STORE, typename ComputeType>
-inline cudaError_t DispatchLayerNormBlockUncachedImpl(
-    cudaStream_t stream,
-    LOAD load,
-    STORE store,
-    const int64_t rows,
-    const int64_t cols,
-    const double epsilon,
-    ComputeType* mean,
-    ComputeType* inv_variance) {
+inline GPU(Error_t)
+    DispatchLayerNormBlockUncachedImpl(GPU(Stream_t) stream,
+                                       LOAD load,
+                                       STORE store,
+                                       const int64_t rows,
+                                       const int64_t cols,
+                                       const double epsilon,
+                                       ComputeType* mean,
+                                       ComputeType* inv_variance) {
   return DispatchLayerNormBlockUncachedImplPackSize<LOAD, STORE, ComputeType>()(
       stream, load, store, rows, cols, epsilon, mean, inv_variance);
 }
 
 template <typename LOAD, typename STORE, typename ComputeType>
 inline typename std::enable_if<!std::is_same<ComputeType, double>::value,
-                               cudaError_t>::type
-DispatchLayerNorm(cudaStream_t stream,
+                               GPU(Error_t)>::type
+DispatchLayerNorm(GPU(Stream_t) stream,
                   LOAD load,
                   STORE store,
                   const int64_t rows,
@@ -748,19 +800,19 @@ DispatchLayerNorm(cudaStream_t stream,
   const ComputeType col_divisor = 1.0f / cols;
   bool dispatch_smem_impl_success;
   {
-    cudaError_t err =
-        TryDispatchLayerNormBlockSMemImpl<LOAD, STORE, ComputeType>(
-            stream,
-            load,
-            store,
-            rows,
-            cols,
-            epsilon,
-            mean,
-            inv_variance,
-            col_divisor,
-            &dispatch_smem_impl_success);
-    if (err != cudaSuccess) {
+    GPU(Error_t)
+    err = TryDispatchLayerNormBlockSMemImpl<LOAD, STORE, ComputeType>(
+        stream,
+        load,
+        store,
+        rows,
+        cols,
+        epsilon,
+        mean,
+        inv_variance,
+        col_divisor,
+        &dispatch_smem_impl_success);
+    if (err != GPU(Success)) {
       return err;
     }
   }
@@ -768,13 +820,13 @@ DispatchLayerNorm(cudaStream_t stream,
     return DispatchLayerNormBlockUncachedImpl<LOAD, STORE, ComputeType>(
         stream, load, store, rows, cols, epsilon, mean, inv_variance);
   }
-  return cudaSuccess;
+  return GPU(Success);
 }
 
 template <typename LOAD, typename STORE, typename ComputeType>
 inline typename std::enable_if<std::is_same<ComputeType, double>::value,
-                               cudaError_t>::type
-DispatchLayerNorm(cudaStream_t stream,
+                               GPU(Error_t)>::type
+DispatchLayerNorm(GPU(Stream_t) stream,
                   LOAD load,
                   STORE store,
                   const int64_t rows,
@@ -918,8 +970,6 @@ struct SkipLoadAndStoreResidual {
   int64_t row_size;
 };
 
-#endif
-
 }  // namespace
 
 template <typename T, typename Context>
@@ -940,9 +990,6 @@ void FusedLayerNormKernel(const Context& dev_ctx,
                           DenseTensor* residual_out,
                           DenseTensor* mean,
                           DenseTensor* variance) {
-#if defined(PADDLE_WITH_HIP)
-  LOG(ERROR) << "Please compile with CUDA, ROCM platform isn't support it";
-#else
   using U = phi::funcs::LayerNormParamType<T>;
   const T* x_data = x.data<T>();
   const U* norm_weight_data =
@@ -1059,7 +1106,6 @@ void FusedLayerNormKernel(const Context& dev_ctx,
                                                             variance_data);
     }
   }
-#endif
 }
 
 }  // namespace fusion
diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h
index bed1535d6fa1d1..c3c9ece6676cbb 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h
@@ -24,6 +24,12 @@ namespace fusion {
 
 #define LN_NUM_COLS 1024
 
+#ifdef PADDLE_WITH_HIP
+#define WARPSIZE 64
+#else
+#define WARPSIZE 32
+#endif
+
 template <typename T>
 using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
 template <typename T>
@@ -137,9 +143,9 @@ __global__ void FusedLayernormResidualDropoutBias(
   int col_id = threadIdx.x;
   int row_id = blockIdx.x;
   int idx = row_id * cols + col_id;
-  curandStatePhilox4_32_10_t state;
+  GPURAND(StatePhilox4_32_10_t) state;
   if (HasDropout) {
-    curand_init(seed, idx, increment, &state);
+    GPURAND(_init)(seed, idx, increment, &state);
   }
 
   T factor =
@@ -147,8 +153,13 @@ __global__ void FusedLayernormResidualDropoutBias(
 
   __shared__ U mean_share;
   __shared__ U var_share;
+#ifdef PADDLE_WITH_HIP
+  __shared__ U shared_mean[64];
+  __shared__ U shared_var[64];
+#else
   __shared__ U shared_mean[32];
   __shared__ U shared_var[32];
+#endif
 
   phi::funcs::ReluFunctor<T> relu;
   U mean_val = 0;
@@ -331,16 +342,21 @@ __global__ void FusedLayernormResidualDropoutBiasInfer(
   int col_id = threadIdx.x;
   int row_id = blockIdx.x;
   int idx = row_id * cols + col_id;
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
+  GPURAND(StatePhilox4_32_10_t) state;
+  GPURAND(_init)(seed, idx, increment, &state);
 
   T factor =
       phi::fusion::GetFactor<T>(dropout_prob, is_upscale_in_train, is_test);
 
   __shared__ U mean_share;
   __shared__ U var_share;
+#ifdef PADDLE_WITH_HIP
+  __shared__ U shared_mean[64];
+  __shared__ U shared_var[64];
+#else
   __shared__ U shared_mean[32];
   __shared__ U shared_var[32];
+#endif
 
   phi::funcs::ReluFunctor<T> relu;
   U mean_val = 0;
@@ -421,7 +437,7 @@ struct FusedLayernormResidualDropoutBiasFunctor {
       T *layernorm_dst,
       LayerNormParamType<T> *mean,
       LayerNormParamType<T> *var,
-      cudaStream_t stream) {
+      GPU(Stream_t) stream) {
     int blockDim = phi::funcs::GetDesiredBlockDim(cols / VecSize);
     if (mean != nullptr && var != nullptr) {
       LaunchFusedLayernormResidualDropoutBiasCUDAKernel<T,
@@ -512,7 +528,7 @@ template <bool HasDropout,
           int WARPS_N = 1,
           int BYTES_PER_LDG = 16,
           int ELTS_PER_ROW = 1024,
-          int THREADS_PER_WARP = 32,
+          int THREADS_PER_WARP = WARPSIZE,
           int THREADS_PER_ROW = WARPS_N *THREADS_PER_WARP,
           int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW,
           int ROWS_PER_CTA = WARPS_M,
@@ -565,9 +581,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
   const int r = bidx * ROWS_PER_CTA + warp_m;      // row id
 
   int idx = r * ELTS_PER_ROW + c;
-  curandStatePhilox4_32_10_t state;
+  GPURAND(StatePhilox4_32_10_t) state;
   if (HasDropout) {
-    curand_init(seed, idx, increment, &state);
+    GPURAND(_init)(seed, idx, increment, &state);
   }
 
   T factor =
@@ -620,7 +636,6 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
         RandVec<VecSize>(&state, rand);
 #pragma unroll
         for (int jt = 0; jt < VecSize; jt++) {
-#pragma unroll
           mask_vec[it][jt] = static_cast<MaskType>(rand[jt] >= dropout_prob);
         }
       }
@@ -708,7 +726,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
 
 #pragma unroll
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
+#ifdef PADDLE_WITH_HIP
+      mu_local += __shfl_xor(mu_local, it);
+#else
       mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it);
+#endif
     }
     if (WARPS_N > 1) {
       if (lane == 0) {
@@ -743,7 +765,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel(
 
 #pragma unroll
     for (int it = 1; it < THREADS_PER_WARP; it *= 2) {
+#ifdef PADDLE_WITH_HIP
+      var_local += __shfl_xor(var_local, it);
+#else
       var_local += __shfl_xor_sync(uint32_t(-1), var_local, it);
+#endif
     }
     if (WARPS_N > 1) {
       if (lane == 0) {
@@ -867,7 +893,7 @@ void LaunchLayernormResidualDropoutBias(
                             rows * cols * sizeof(T),
                             ctx.stream());
     if (mask_data != nullptr) {
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemsetAsync)(
           mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream()));
     }
     // call layernorm forward
@@ -896,7 +922,7 @@ void LaunchLayernormResidualDropoutBias(
   case (cols): {                                                               \
     constexpr int WARPS_N = cols < 1024 ? 1 : (cols / 1024);                   \
     constexpr int WARPS_M = 4 / WARPS_N;                                       \
-    const int THREADS_PER_WARP = 32;                                           \
+    const int THREADS_PER_WARP = WARPSIZE;                                     \
     const int BYTES_PER_LDG = 16;                                              \
     const int VecSize = BYTES_PER_LDG / sizeof(T);                             \
     const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;          \
diff --git a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
index 4995360811b389..8cd4902ec59c3a 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
@@ -41,7 +41,7 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
     const int row_id,
     const int col_id,
     const int cols,
-    curandStatePhilox4_32_10_t *state,
+    GPURAND(StatePhilox4_32_10_t) * state,
     const float dropout_prob,
     const T factor,
     const InType *__restrict__ src,
@@ -281,9 +281,9 @@ __global__ void FusedResidualDropoutBias(
   int col_id = blockDim.x * blockIdx.x + threadIdx.x;
   int row_id = blockIdx.y;
   int idx = row_id * cols + col_id;
-  curandStatePhilox4_32_10_t state;
+  GPURAND(StatePhilox4_32_10_t) state;
   if (HasDropout) {
-    curand_init(seed, idx, increment, &state);
+    GPURAND(_init)(seed, idx, increment, &state);
   }
   T factor;
   if (HasDropout) {
diff --git a/paddle/phi/kernels/gpu/rms_norm_funcs.h b/paddle/phi/kernels/gpu/rms_norm_funcs.h
index 2bf035d30e1dc1..82586aacd130f3 100644
--- a/paddle/phi/kernels/gpu/rms_norm_funcs.h
+++ b/paddle/phi/kernels/gpu/rms_norm_funcs.h
@@ -21,22 +21,29 @@ limitations under the License. */
 #pragma once
 
 #include <assert.h>
-#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
-#ifndef PADDLE_WITH_HIP
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#include "paddle/phi/backends/gpu/rocm/miopen_helper.h"
+#define GPU(str) hip##str
+#else
 #include <cuda.h>          // NOLINT
 #include <cuda_runtime.h>  // NOLINT
 #include <cub/cub.cuh>
+#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
+#define GPU(str) cuda##str
 #endif
 
 namespace phi {
 
 namespace {  // NOLINT
-#ifndef PADDLE_WITH_HIP
 
 #define DEFAULT_THROW(NAME, TYPE)                              \
   default:                                                     \
@@ -78,14 +85,22 @@ namespace {  // NOLINT
     }                                                                      \
   } while (0)
 
+#ifdef PADDLE_WITH_HIP
+#define WARP_SIZE 64
+#else
 #define WARP_SIZE 32
+#endif
 
 template <typename T>
 __device__ __forceinline__ T WARP_SHFL_XOR(T value,
                                            int laneMask,
                                            int width = WARP_SIZE,
                                            unsigned int mask = 0xffffffff) {
+#ifdef PADDLE_WITH_HIP
+  return __shfl_xor(value, laneMask, width);
+#else
   return __shfl_xor_sync(mask, value, laneMask, width);
+#endif
 }
 
 template <typename T>
@@ -93,7 +108,11 @@ __device__ __forceinline__ T WARP_SHFL(T value,
                                        int srcLane,
                                        int width = WARP_SIZE,
                                        unsigned int mask = 0xffffffff) {
+#ifdef PADDLE_WITH_HIP
+  return __shfl(value, srcLane, width);
+#else
   return __shfl_sync(mask, value, srcLane, width);
+#endif
 }
 
 template <typename U>
@@ -296,11 +315,21 @@ __device__ void cuWelfordMuSigma2(const phi::dtype::float16* __restrict__ vals,
       for (int k = 0; k < 8; k += 2) {
         float2 curr = __half22float2(*((__half2*)(lvals + l + k)));  // NOLINT
         if (!rms_only) {
+#ifdef PADDLE_WITH_HIP
+          cuWelfordOnlineSum(static_cast<float>(curr.x), mu, sigma2, count);
+          cuWelfordOnlineSum(static_cast<float>(curr.y), mu, sigma2, count);
+#else
           cuWelfordOnlineSum(curr.x, mu, sigma2, count);
           cuWelfordOnlineSum(curr.y, mu, sigma2, count);
+#endif
         } else {
+#ifdef PADDLE_WITH_HIP
+          cuRMSOnlineSum(static_cast<float>(curr.x), sigma2);
+          cuRMSOnlineSum(static_cast<float>(curr.y), sigma2);
+#else
           cuRMSOnlineSum(curr.x, sigma2);
           cuRMSOnlineSum(curr.y, sigma2);
+#endif
         }
       }
     }
@@ -907,7 +936,7 @@ __global__ void cuComputeGradInput(const T* __restrict__ dout,
     __syncthreads();
   }
 }
-#endif
+
 }  // namespace
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
index fab312470fe9f7..d66fade233755c 100644
--- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
@@ -18,22 +18,29 @@ limitations under the License. */
  *     with minor changes. */
 
 #include <assert.h>
-#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#ifndef PADDLE_WITH_HIP
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+#include "paddle/phi/backends/gpu/rocm/miopen_helper.h"
+namespace cub = hipcub;
+#define GPU(str) hip##str
+#else
 #include <cuda.h>          // NOLINT
 #include <cuda_runtime.h>  // NOLINT
 #include <cub/cub.cuh>
-#include "paddle/phi/kernels/gpu/rms_norm_funcs.h"
+#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
+#define GPU(str) cuda##str
 #endif
+#include "paddle/phi/kernels/gpu/rms_norm_funcs.h"
 
 namespace phi {
 
 namespace {
-#ifndef PADDLE_WITH_HIP
 
 template <typename T, typename U, typename V, typename Context>
 void HostRMSNormGradient(const Context& dev_ctx,
@@ -46,7 +53,7 @@ void HostRMSNormGradient(const Context& dev_ctx,
                          double epsilon,
                          T* grad_input,
                          V* grad_gamma) {
-  cudaStream_t stream = dev_ctx.stream();
+  GPU(Stream_t) stream = dev_ctx.stream();
   if (gamma != NULL) {
     const int part_size = 16;
     const dim3 threads2(32, 4, 1);
@@ -144,7 +151,7 @@ void cuda_rms_norm_gradient(const Context& dev_ctx,
                           grad_x->data<T>(),
                           grad_scale->data<SCALE_TYPE>()));
 }
-#endif
+
 }  // namespace
 
 template <typename T, typename Context>
@@ -161,10 +168,6 @@ void RmsNormGradKernel(const Context& dev_ctx,
                        const float quant_scale,
                        DenseTensor* grad_x,
                        DenseTensor* grad_norm_weight) {
-#if defined(PADDLE_WITH_HIP)
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "Please compile with CUDA, ROCM platform isn't support it."));
-#else
   if (bias || residual || norm_bias) {
     PADDLE_THROW(phi::errors::Unimplemented(
         "bias or residual or norm_bias is not supported yet"));
@@ -181,7 +184,6 @@ void RmsNormGradKernel(const Context& dev_ctx,
                                      grad_x,
                                      grad_norm_weight,
                                      begin_norm_axis);
-#endif
 }
 }  // namespace phi
 
diff --git a/paddle/phi/kernels/gpu/rms_norm_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_kernel.cu
index ec138271f43879..67a63694f83c80 100644
--- a/paddle/phi/kernels/gpu/rms_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/rms_norm_kernel.cu
@@ -39,17 +39,30 @@ limitations under the License.
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
-#ifndef PADDLE_WITH_HIP
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#define GPU(str) hip##str
+#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount
+#define GPUMaxSharedMemoryPerBlockOptin hipDeviceAttributeSharedMemPerBlockOptin
+#else
 #include <cub/cub.cuh>
+#define GPU(str) cuda##str
+#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount
+#define GPUMaxSharedMemoryPerBlockOptin cudaDevAttrMaxSharedMemoryPerBlockOptin
 #endif
 
 namespace phi {
 
 namespace {
 
-#ifndef PADDLE_WITH_HIP
-
+#ifdef PADDLE_WITH_HIP
+constexpr int kWarpSize = 64;
+#else
 constexpr int kWarpSize = 32;
+#endif
 
 template <typename T>
 struct SumOp {
@@ -71,7 +84,11 @@ template <template <typename> class ReductionOp,
 __inline__ __device__ T WarpAllReduce(T val) {
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
     val = ReductionOp<T>()(
+#ifdef PADDLE_WITH_HIP
+        val, __shfl_xor(val, mask, thread_group_width));
+#else
         val, __shfl_xor_sync(0xffffffff, val, mask, thread_group_width));
+#endif
   }
   return val;
 }
@@ -116,35 +133,36 @@ __inline__ __device__ double Rsqrt<double>(double x) {
 }
 
 template <class Func>
-inline cudaError_t GetNumBlocks(Func func,
-                                int32_t block_size,
-                                size_t dynamic_smem_size,
-                                int32_t max_blocks,
-                                int32_t waves,
-                                int* num_blocks) {
+inline GPU(Error_t) GetNumBlocks(Func func,
+                                 int32_t block_size,
+                                 size_t dynamic_smem_size,
+                                 int32_t max_blocks,
+                                 int32_t waves,
+                                 int* num_blocks) {
   int dev;
   {
-    cudaError_t err = cudaGetDevice(&dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t) err = GPU(GetDevice)(&dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
   int sm_count;
   {
-    cudaError_t err =
-        cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t)
+    err = GPU(DeviceGetAttribute)(&sm_count, GPUMultiProcessorCount, dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
   int max_active_blocks;
   {
-    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    GPU(Error_t)
+    err = GPU(OccupancyMaxActiveBlocksPerMultiprocessor)(
         &max_active_blocks, func, block_size, dynamic_smem_size);
   }
   *num_blocks = std::max<int>(
       1, std::min<int32_t>(max_blocks, sm_count * max_active_blocks * waves));
-  return cudaSuccess;
+  return GPU(Success);
 }
 
 template <typename T>
@@ -299,9 +317,15 @@ __inline__ __device__ void WelfordWarpReduce(
   *m2 = thread_m2;
   *count = thread_count;
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+#ifdef PADDLE_WITH_HIP
+    T b_mean = __shfl_down(*mean, mask, thread_group_width);
+    T b_m2 = __shfl_down(*m2, mask, thread_group_width);
+    T b_count = __shfl_down(*count, mask, thread_group_width);
+#else
     T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width);
     T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width);
     T b_count = __shfl_down_sync(0xffffffff, *count, mask, thread_group_width);
+#endif
     WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
   }
 }
@@ -311,9 +335,15 @@ __inline__ __device__ void WelfordWarpAllReduce(
     T thread_mean, T thread_m2, T thread_count, T* mean, T* m2, T* count) {
   WelfordWarpReduce<T, thread_group_width>(
       thread_mean, thread_m2, thread_count, mean, m2, count);
+#ifdef PADDLE_WITH_HIP
+  *mean = __shfl(*mean, 0, thread_group_width);
+  *m2 = __shfl(*m2, 0, thread_group_width);
+  *count = __shfl(*count, 0, thread_group_width);
+#else
   *mean = __shfl_sync(0xffffffff, *mean, 0, thread_group_width);
   *m2 = __shfl_sync(0xffffffff, *m2, 0, thread_group_width);
   *count = __shfl_sync(0xffffffff, *count, 0, thread_group_width);
+#endif
 }
 
 template <typename T, int thread_group_width = kWarpSize>
@@ -321,7 +351,11 @@ __inline__ __device__ T WarpReduceSum(T x) {
   T result = 0.0f;
 #pragma unroll
   for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+#ifdef PADDLE_WITH_HIP
+    result += __shfl_xor(x, mask, thread_group_width);
+#else
     result += __shfl_xor_sync(0xffffffff, x, mask, thread_group_width);
+#endif
   }
   return result;
 }
@@ -363,7 +397,11 @@ __inline__ __device__ void WelfordBlockAllReduce(T thread_mean,
       warp_m2 = static_cast<T>(0);
       warp_count = static_cast<T>(0);
     }
+#ifdef PADDLE_WITH_HIP
+    __syncthreads();
+#else
     __syncwarp();
+#endif
     T block_mean = 0;
     T block_m2 = 0;
     T block_count = 0;
@@ -441,61 +479,73 @@ template <typename LOAD,
           typename ComputeType,
           int kPackSize,
           int block_size>
-inline cudaError_t LaunchRmsNormBlockSMemImpl(cudaStream_t stream,
-                                              LOAD load,
-                                              STORE store,
-                                              int smem,
-                                              const int32_t rows,
-                                              const int32_t cols,
-                                              const float epsilon,
-                                              ComputeType col_divisor,
-                                              float* inv_var_data) {
+inline GPU(Error_t) LaunchRmsNormBlockSMemImpl(GPU(Stream_t) stream,
+                                               LOAD load,
+                                               STORE store,
+                                               int smem,
+                                               const int32_t rows,
+                                               const int32_t cols,
+                                               const float epsilon,
+                                               ComputeType col_divisor,
+                                               float* inv_var_data) {
   constexpr int waves = 32;
   int grid_dim_x;
   {
-    cudaError_t err = GetNumBlocks(
+    GPU(Error_t)
+    err = GetNumBlocks(
         RmsNormBlockSMemImpl<LOAD, STORE, ComputeType, kPackSize, block_size>,
         block_size,
         smem,
         rows,
         waves,
         &grid_dim_x);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
   RmsNormBlockSMemImpl<LOAD, STORE, ComputeType, kPackSize, block_size>
       <<<grid_dim_x, block_size, smem, stream>>>(
           load, store, rows, cols, epsilon, col_divisor, inv_var_data);
-  return cudaPeekAtLastError();
+  return GPU(PeekAtLastError)();
 }
 
 template <typename Func>
-cudaError_t MaximizeDynamicSharedMemorySize(Func func,
-                                            const int max_smem_size) {
-  cudaFuncAttributes attr{};
+GPU(Error_t)
+MaximizeDynamicSharedMemorySize(Func func, const int max_smem_size) {
+  GPU(FuncAttributes) attr{};
+#ifdef PADDLE_WITH_HIP
+  hipError_t err = hipFuncGetAttributes(&attr, (const void*)func);
+#else
   cudaError_t err = cudaFuncGetAttributes(&attr, func);
-  if (err != cudaSuccess) {
+#endif
+  if (err != GPU(Success)) {
     return err;
   }
   constexpr int reserved_smem = 1024;  // 1K
+#ifdef PADDLE_WITH_HIP
+  return hipFuncSetAttribute(
+      (const void*)func,
+      hipFuncAttributeMaxDynamicSharedMemorySize,
+      max_smem_size - attr.sharedSizeBytes - reserved_smem);
+#else
   return cudaFuncSetAttribute(
       func,
       cudaFuncAttributeMaxDynamicSharedMemorySize,
       max_smem_size - attr.sharedSizeBytes - reserved_smem);
+#endif
 }
 
 template <typename LOAD, typename STORE, typename ComputeType, int kPackSize>
-inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
-    cudaStream_t stream,
-    LOAD load,
-    STORE store,
-    const int32_t rows,
-    const int32_t cols,
-    const float epsilon,
-    ComputeType col_divisor,
-    bool* success,
-    float* inv_var_data) {
+inline GPU(Error_t)
+    TryDispatchRmsNormBlockSMemImplBlockSize(GPU(Stream_t) stream,
+                                             LOAD load,
+                                             STORE store,
+                                             const int32_t rows,
+                                             const int32_t cols,
+                                             const float epsilon,
+                                             ComputeType col_divisor,
+                                             bool* success,
+                                             float* inv_var_data) {
   constexpr int block_size_conf_1 = 128;
   constexpr int block_size_conf_2 = 256;
   constexpr int block_size_conf_3 = 512;
@@ -503,26 +553,27 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
 
   int dev = 0;
   {
-    cudaError_t err = cudaGetDevice(&dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t) err = GPU(GetDevice)(&dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
 
   int sm_count = 0;
   {
-    cudaError_t err =
-        cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t)
+    err = GPU(DeviceGetAttribute)(&sm_count, GPUMultiProcessorCount, dev);
+    if (err != GPU(Success)) {
       return err;
     }
   }
 
   static const bool max_smem_configed = [=]() {
     int max_smem_size = 0;
-    cudaError_t err = cudaDeviceGetAttribute(
-        &max_smem_size, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
-    if (err != cudaSuccess) {
+    GPU(Error_t)
+    err = GPU(DeviceGetAttribute)(
+        &max_smem_size, GPUMaxSharedMemoryPerBlockOptin, dev);
+    if (err != GPU(Success)) {
       return false;
     }
 
@@ -533,7 +584,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                                                              kPackSize,
                                                              block_size_conf_1>,
                                         max_smem_size);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return false;
     }
     err =
@@ -543,7 +594,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                                                              kPackSize,
                                                              block_size_conf_2>,
                                         max_smem_size);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return false;
     }
     err =
@@ -553,7 +604,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                                                              kPackSize,
                                                              block_size_conf_3>,
                                         max_smem_size);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return false;
     }
     err =
@@ -563,7 +614,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                                                              kPackSize,
                                                              block_size_conf_4>,
                                         max_smem_size);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return false;
     }
 
@@ -574,7 +625,8 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
 
   int max_active_blocks_conf_1;
   {
-    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    GPU(Error_t)
+    err = GPU(OccupancyMaxActiveBlocksPerMultiprocessor)(
         &max_active_blocks_conf_1,
         RmsNormBlockSMemImpl<LOAD,
                              STORE,
@@ -583,18 +635,19 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                              block_size_conf_1>,
         block_size_conf_1,
         smem);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
   if (max_active_blocks_conf_1 <= 0) {
     *success = false;
-    return cudaSuccess;
+    return GPU(Success);
   }
 
   int max_active_blocks_conf_4;
   {
-    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    GPU(Error_t)
+    err = GPU(OccupancyMaxActiveBlocksPerMultiprocessor)(
         &max_active_blocks_conf_4,
         RmsNormBlockSMemImpl<LOAD,
                              STORE,
@@ -603,7 +656,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                              block_size_conf_4>,
         block_size_conf_4,
         smem);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
@@ -628,7 +681,8 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
 
   int max_active_blocks_conf_3;
   {
-    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    GPU(Error_t)
+    err = GPU(OccupancyMaxActiveBlocksPerMultiprocessor)(
         &max_active_blocks_conf_3,
         RmsNormBlockSMemImpl<LOAD,
                              STORE,
@@ -637,7 +691,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                              block_size_conf_3>,
         block_size_conf_3,
         smem);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
@@ -661,7 +715,8 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
 
   int max_active_blocks_conf_2;
   {
-    cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    GPU(Error_t)
+    err = GPU(OccupancyMaxActiveBlocksPerMultiprocessor)(
         &max_active_blocks_conf_2,
         RmsNormBlockSMemImpl<LOAD,
                              STORE,
@@ -670,7 +725,7 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
                              block_size_conf_2>,
         block_size_conf_2,
         smem);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
@@ -710,15 +765,16 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImplBlockSize(
 
 template <typename LOAD, typename STORE, typename ComputeType>
 struct TryDispatchRmsNormBlockSMemImplPackSize {
-  cudaError_t operator()(cudaStream_t stream,
-                         LOAD load,
-                         STORE store,
-                         const int32_t rows,
-                         const int32_t cols,
-                         const float epsilon,
-                         ComputeType col_divisor,
-                         bool* success,
-                         float* inv_var_data) {
+  GPU(Error_t)
+  operator()(GPU(Stream_t) stream,
+             LOAD load,
+             STORE store,
+             const int32_t rows,
+             const int32_t cols,
+             const float epsilon,
+             ComputeType col_divisor,
+             bool* success,
+             float* inv_var_data) {
     if (cols % 4 == 0 && CanPackAs<LOAD>(load, 4) &&
         CanPackAs<STORE>(store, 4)) {
       return TryDispatchRmsNormBlockSMemImplBlockSize<LOAD,
@@ -765,15 +821,15 @@ struct TryDispatchRmsNormBlockSMemImplPackSize {
 };
 
 template <typename LOAD, typename STORE, typename ComputeType>
-inline cudaError_t TryDispatchRmsNormBlockSMemImpl(cudaStream_t stream,
-                                                   LOAD load,
-                                                   STORE store,
-                                                   const int32_t rows,
-                                                   const int32_t cols,
-                                                   const float epsilon,
-                                                   ComputeType col_divisor,
-                                                   bool* success,
-                                                   float* inv_var_data) {
+inline GPU(Error_t) TryDispatchRmsNormBlockSMemImpl(GPU(Stream_t) stream,
+                                                    LOAD load,
+                                                    STORE store,
+                                                    const int32_t rows,
+                                                    const int32_t cols,
+                                                    const float epsilon,
+                                                    ComputeType col_divisor,
+                                                    bool* success,
+                                                    float* inv_var_data) {
   return TryDispatchRmsNormBlockSMemImplPackSize<LOAD, STORE, ComputeType>()(
       stream,
       load,
@@ -788,8 +844,8 @@ inline cudaError_t TryDispatchRmsNormBlockSMemImpl(cudaStream_t stream,
 
 template <typename LOAD, typename STORE, typename ComputeType>
 inline typename std::enable_if<!std::is_same<ComputeType, double>::value,
-                               cudaError_t>::type
-DispatchRmsNorm(cudaStream_t stream,
+                               GPU(Error_t)>::type
+DispatchRmsNorm(GPU(Stream_t) stream,
                 LOAD load,
                 STORE store,
                 const int32_t rows,
@@ -799,7 +855,8 @@ DispatchRmsNorm(cudaStream_t stream,
   const ComputeType col_divisor = 1.0f / cols;
   bool dispatch_smem_impl_success;
   {
-    cudaError_t err = TryDispatchRmsNormBlockSMemImpl<LOAD, STORE, ComputeType>(
+    GPU(Error_t)
+    err = TryDispatchRmsNormBlockSMemImpl<LOAD, STORE, ComputeType>(
         stream,
         load,
         store,
@@ -809,11 +866,11 @@ DispatchRmsNorm(cudaStream_t stream,
         col_divisor,
         &dispatch_smem_impl_success,
         inv_var_data);
-    if (err != cudaSuccess) {
+    if (err != GPU(Success)) {
       return err;
     }
   }
-  return cudaSuccess;
+  return GPU(Success);
 }
 
 template <typename SRC, typename DST>
@@ -998,8 +1055,6 @@ struct AffineQuantStore {
   const float quant_min_bound;
 };
 
-#endif
-
 }  // namespace
 
 template <typename T, typename Context>
@@ -1018,9 +1073,6 @@ void RmsNormKernel(const Context& dev_ctx,
                    DenseTensor* out,
                    DenseTensor* residual_out,
                    DenseTensor* inv_var) {
-#if defined(PADDLE_WITH_HIP)
-  LOG(ERROR) << "Please compile with CUDA, ROCM platform isn't support it";
-#else
   using ComputeType = typename phi::dtype::MPTypeTrait<T>::Type;
 
   const T* x_data = x.data<T>();
@@ -1096,7 +1148,6 @@ void RmsNormKernel(const Context& dev_ctx,
           dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data);
     }
   }
-#endif
 }
 
 }  // namespace phi
diff --git a/test/legacy_test/test_fused_layernorm_op.py b/test/legacy_test/test_fused_layernorm_op.py
index c564a3c11fbe6d..a8820f5dbc5264 100644
--- a/test/legacy_test/test_fused_layernorm_op.py
+++ b/test/legacy_test/test_fused_layernorm_op.py
@@ -103,7 +103,8 @@ def naive_residual_biasadd_layer_norm_int8(
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA "
+    not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(),
+    "core is not compiled with CUDA or ROCM",
 )
 class TestlayernormOp(unittest.TestCase):
     def setUp(self):
@@ -381,7 +382,8 @@ def test_residual_bias_add_layernorm_int8(self):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA "
+    not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(),
+    "core is not compiled with CUDA or ROCM",
 )
 class TestlayernormStaticOp(unittest.TestCase):
     def setUp(self):
diff --git a/test/legacy_test/test_rms_norm_op.py b/test/legacy_test/test_rms_norm_op.py
index f8ae5769cfaaf6..7c642716600bcf 100644
--- a/test/legacy_test/test_rms_norm_op.py
+++ b/test/legacy_test/test_rms_norm_op.py
@@ -98,7 +98,8 @@ def naive_residual_biasadd_rms_norm_int8(
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA "
+    not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(),
+    "core is not compiled with CUDA or ROCM",
 )
 class TestRMSNormOp(unittest.TestCase):
     def setUp(self):
@@ -347,7 +348,8 @@ def get_forward_backward(func, seed, dtype):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA "
+    not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(),
+    "core is not compiled with CUDA or ROCM",
 )
 class TestRMSNormStaticOp(unittest.TestCase):
     def setUp(self):

From 0631c13a51aa3639edf8146a3bfa242d7fdd5ac4 Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Wed, 3 Apr 2024 15:21:09 +0000
Subject: [PATCH 2/9] [DCU] fix a small bug

---
 paddle/phi/CMakeLists.txt         | 470 +++++++++++++-----------------
 paddle/phi/kernels/CMakeLists.txt |   2 -
 2 files changed, 195 insertions(+), 277 deletions(-)

diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 9e13c1c269222f..7325aef2202b59 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -1,311 +1,231 @@
-set(kernel_declare_file
-    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp
-    CACHE INTERNAL "declarations.h file")
-set(kernel_declare_file_final
-    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h)
-file(
-  WRITE ${kernel_declare_file}
-  "// Generated by the paddle/phi/kernels/CMakeLists.txt.  DO NOT EDIT!\n\n#pragma once\n\n"
-)
-file(APPEND ${kernel_declare_file}
-     "#include \"paddle/phi/core/kernel_registry.h\"\n\n")
-set(kernel_declare_file_prune
-    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.prune
-    CACHE INTERNAL "declarations.h file")
-
-# phi functors and functions called by kernels
-add_subdirectory(funcs)
-
-# kernel autotune
-add_subdirectory(autotune)
-
-copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
-
-file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
-file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h")
-file(GLOB kernel_primitive_h "primitive/*.h")
-
-# fusion ops would be included here
-file(
-  GLOB kernel_cu
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "gpu/*.cu"
-  "gpu/*.cu.cc"
-  "gpudnn/*.cu"
-  "kps/*.cu"
-  "legacy/kps/*.cu"
-  "legacy/gpu/*.cu"
-  "selected_rows/gpu/*.cu"
-  "sparse/gpu/*.cu"
-  "strings/gpu/*.cu"
-  "fusion/gpu/*.cu")
-
-if(APPLE OR WIN32)
-  list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
+configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h)
+
+# phi auto cmake utils
+include(phi)
+
+set(common_srcs CACHE INTERNAL "" FORCE)
+set(api_srcs CACHE INTERNAL "" FORCE)
+set(capi_srcs CACHE INTERNAL "" FORCE)
+set(core_srcs CACHE INTERNAL "" FORCE)
+set(backends_srcs CACHE INTERNAL "" FORCE)
+set(kernels_srcs CACHE INTERNAL "" FORCE)
+set(infermeta_srcs CACHE INTERNAL "" FORCE)
+#set(excluded_srcs CACHE INTERNAL "" FORCE)
+
+# paddle experimental common components
+add_subdirectory(common)
+# phi (low level) api headers: include
+# phi (high level) api
+add_subdirectory(api)
+# phi core components
+add_subdirectory(core)
+# phi components of specific backends
+add_subdirectory(backends)
+# phi kernels for diff device
+add_subdirectory(kernels)
+# phi infermeta
+add_subdirectory(infermeta)
+# phi tools
+add_subdirectory(tools)
+# phi capi
+if(WITH_CUSTOM_DEVICE)
+  add_subdirectory(capi)
 endif()
 
-if(NOT WITH_DGC)
-  list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu")
+set(PHI_DEPS
+    phi_profiler_proto
+    auto_parallel_proto
+    glog
+    warpctc
+    warprnnt
+    eigen3
+    xxhash
+    cblas
+    utf8proc
+    common)
+
+set(INFERENCE_DEPS phi_profiler_proto auto_parallel_proto)
+
+if(WITH_GPU)
+  list(APPEND PHI_DEPS external_error_proto)
 endif()
 
-if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
-  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
-  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
+if(WITH_ASCEND_CL)
+  list(APPEND PHI_DEPS npu_hccl)
 endif()
 
-if(WITH_CUTLASS)
-  execute_process(
-    COMMAND
-      ${PYTHON_EXECUTABLE}
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_tmp"
-    RESULT_VARIABLE memory_efficient_attention_gen_res)
-
-  execute_process(
-    COMMAND
-      ${PYTHON_EXECUTABLE}
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py
-      --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_variable_tmp"
-    RESULT_VARIABLE memory_efficient_attention_gen_res)
-
-  if(NOT memory_efficient_attention_gen_res EQUAL 0)
-    message(
-      FATAL_ERROR
-        "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
-    )
-  endif()
-
-  set(autogen_tmp_dir
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp
-  )
-  set(autogen_variable_tmp_dir
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp
-  )
-  set(autogen_dir
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen
-  )
-  set(autogen_variable_dir
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable
-  )
-
-  file(GLOB generated_files ${autogen_tmp_dir}/*.h ${autogen_tmp_dir}/impl/*.cu)
+if(WITH_FLASHATTN)
+  list(APPEND PHI_DEPS flashattn)
+endif()
 
-  file(GLOB variable_generated_files ${autogen_variable_tmp_dir}/*.h
-       ${autogen_variable_tmp_dir}/impl/*.cu)
+if(WITH_XBYAK)
+  list(APPEND PHI_DEPS xbyak)
+endif()
 
-  if(EXISTS ${autogen_dir})
-    foreach(gen_file ${generated_files})
-      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                              "${gen_file}" "${now_file}")
-    endforeach()
-    message("copy if different ${autogen_dir}")
-  else()
-    foreach(gen_file ${generated_files})
-      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
-                              "${now_file}")
-    endforeach()
-    message("copy ${autogen_dir}")
-  endif()
+if(WITH_MKLDNN)
+  list(APPEND PHI_DEPS mkldnn)
+endif()
 
-  if(EXISTS ${autogen_variable_dir})
-    foreach(gen_file ${variable_generated_files})
-      string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file
-                     ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                              "${gen_file}" "${now_file}")
-    endforeach()
-    message("copy if different ${autogen_variable_dir}")
-  else()
-    foreach(gen_file ${variable_generated_files})
-      string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file
-                     ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
-                              "${now_file}")
-    endforeach()
-    message("copy ${autogen_variable_dir}")
-  endif()
+if(WITH_GLOO)
+  list(APPEND PHI_DEPS gloo)
+endif()
 
-  file(
-    REMOVE_RECURSE
-    ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp
-  )
-  file(
-    REMOVE_RECURSE
-    ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp
-  )
+if(WITH_CUDNN_FRONTEND)
+  list(APPEND PHI_DEPS cudnn-frontend)
+endif()
 
-  execute_process(
-    COMMAND
-      ${CMAKE_COMMAND} -E make_directory
-      "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp"
-    COMMAND ${PYTHON_EXECUTABLE} generic_mixed_gemm_kernelLauncher.py
-            --cuda_arch "${NVCC_ARCH_BIN}"
-    WORKING_DIRECTORY
-      "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm"
-  )
-  set(fpA_intB_gemm_autogen_tmp_dir
-      ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp
-  )
-  set(fpA_intB_gemm_autogen_dir
-      ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen
-  )
+if(WITH_POCKETFFT)
+  list(APPEND PHI_DEPS pocketfft)
+endif()
 
-  file(GLOB fpA_intB_gemm_autogen_files ${fpA_intB_gemm_autogen_tmp_dir}/*.h
-       ${fpA_intB_gemm_autogen_tmp_dir}/*.cu)
+if(WITH_MKLML)
+  list(APPEND PHI_DEPS pocketfft dynload_mklml)
+  list(APPEND INFERENCE_DEPS dynload_mklml)
+endif()
 
-  if(EXISTS ${fpA_intB_gemm_autogen_dir})
-    foreach(gen_file ${fpA_intB_gemm_autogen_files})
-      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                              "${gen_file}" "${now_file}")
-    endforeach()
-    message("copy if different ${fpA_intB_gemm_autogen_dir}")
-  else()
-    foreach(gen_file ${fpA_intB_gemm_autogen_files})
-      string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
-                              "${now_file}")
-    endforeach()
-    message("copy ${fpA_intB_gemm_autogen_dir}")
+if(WITH_XPU)
+  list(APPEND PHI_DEPS xpulib)
+  if(WITH_XPU_PLUGIN)
+    add_subdirectory(kernels/xpu/plugin)
+    list(APPEND PHI_DEPS xpuplugin)
   endif()
-
-  file(
-    GLOB cutlass_cu
-    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-    "fusion/cutlass/*.cu"
-    "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
-    "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
-    "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu"
-    "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu")
-
-  list(APPEND kernel_cu ${cutlass_cu})
 endif()
 
-if(NOT WITH_CUDNN_FRONTEND)
-  list(
-    REMOVE_ITEM
-    kernel_cu
-    "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu"
-    "fusion/gpu/fused_scale_bias_add_relu_kernel.cu"
-    "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu"
-    "fusion/gpu/fused_dot_product_attention_op.cu"
-    "fusion/gpu/max_pool2d_v2_grad_kernel.cu"
-    "fusion/gpu/max_pool2d_v2_kernel.cu")
+if(WITH_DGC)
+  list(APPEND PHI_DEPS dgc)
 endif()
 
-# Note(qili93): remove kernels not supported on DCU yet
-if(WITH_ROCM)
-  list(
-    REMOVE_ITEM
-    kernel_cu
-    "gpu/affine_grid_grad_kernel.cu"
-    "gpu/apply_per_channel_scale_kernel.cu"
-    "gpu/cholesky_solve_kernel.cu"
-    "gpu/eigh_kernel.cu"
-    "gpu/eigvalsh_kernel.cu"
-    "gpu/lstsq_kernel.cu"
-    "gpu/lu_kernel.cu"
-    "gpu/matrix_rank_kernel.cu"
-    "gpu/matrix_rank_tol_kernel.cu"
-    "gpu/put_along_axis_grad_kernel.cu"
-    "gpu/put_along_axis_kernel.cu"
-    "gpu/qr_kernel.cu"
-    "gpu/svd_kernel.cu"
-    "gpudnn/mha_cudnn_frontend.cu"
-    "fusion/gpu/block_multi_head_attention_kernel.cu"
-    "fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
-    "fusion/gpu/fused_bn_add_activation_kernel.cu"
-    "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
+set(PHI_SRCS
+    ${common_srcs}
+    ${api_srcs}
+    ${core_srcs}
+    ${backends_srcs}
+    ${kernels_srcs}
+    ${infermeta_srcs}
+    ${capi_srcs})
+
+if(WITH_SHARED_PHI)
+  set(PHI_BUILD_TYPE
+      SHARED
+      CACHE INTERNAL "" FORCE)
+else()
+  set(PHI_BUILD_TYPE
+      STATIC
+      CACHE INTERNAL "" FORCE)
 endif()
 
-set(cc_search_pattern
-    "*.cc"
-    "cpu/*.cc"
-    "legacy/*.cc"
-    "legacy/cpu/*.cc"
-    "selected_rows/*.cc"
-    "selected_rows/cpu/*.cc"
-    "sparse/*.cc"
-    "sparse/cpu/*.cc"
-    "legacy/*.cc"
-    "legacy/cpu/*.cc"
-    "strings/*.cc"
-    "strings/cpu/*.cc"
-    "fusion/*.cc"
-    "stride/*.cc"
-    "fusion/cpu/*.cc")
-
-if(WITH_MKLDNN)
-  set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc"
-                        "fusion/onednn/*.cc")
+if(WITH_AVX
+   AND AVX512F_FOUND
+   AND AVX512F_FLAG
+   AND WITH_MKL)
+  set_source_files_properties(
+    kernels/fusion/cpu/self_dp_attention_kernel.cc
+    PROPERTIES COMPILE_FLAGS "-Wno-maybe-uninitialized  -mfma ${AVX512F_FLAG}")
 endif()
 
-if(WITH_CUSTOM_DEVICE)
-  set(cc_search_pattern ${cc_search_pattern} "custom/*.cc")
+if(WITH_GPU)
+  set_source_files_properties(
+    backends/gpu/gpu_resources.cc
+    PROPERTIES COMPILE_FLAGS
+               "-DCUDA_REAL_ARCHS=\"${NVCC_FLAGS_EXTRA_real_archs}\"")
+  nv_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
+
+elseif(WITH_ROCM)
+  hip_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
+
+elseif(WITH_XPU_KP)
+  xpu_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
+else()
+  cc_library(
+    phi ${PHI_BUILD_TYPE}
+    SRCS ${PHI_SRCS}
+    DEPS ${PHI_DEPS})
 endif()
 
-file(
-  GLOB kernel_cc
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  ${cc_search_pattern})
+target_compile_definitions(phi PUBLIC PHI_INNER)
 
-if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
-  list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$")
+if(WIN32)
+  target_link_libraries(phi shlwapi.lib)
 endif()
 
-if(NOT
-   (WITH_AVX
-    AND AVX512F_FOUND
-    AND AVX512F_FLAG
-    AND WITH_MKL))
-  list(REMOVE_ITEM kernel_cc "fusion/cpu/self_dp_attention_kernel.cc")
+if(WIN32)
+  if(WITH_SHARED_PHI)
+    set_property(TARGET phi PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    set(PHI_NAME
+        phi.dll
+        CACHE INTERNAL "" FORCE)
+  else()
+    set(PHI_NAME
+        phi.lib
+        CACHE INTERNAL "" FORCE)
+  endif()
+elseif(APPLE)
+  if(WITH_SHARED_PHI)
+    set(PHI_NAME
+        libphi.dylib
+        CACHE INTERNAL "" FORCE)
+  else()
+    set(PHI_NAME
+        libphi.a
+        CACHE INTERNAL "" FORCE)
+  endif()
+else()
+  if(WITH_SHARED_PHI)
+    set(PHI_NAME
+        libphi.so
+        CACHE INTERNAL "" FORCE)
+  else()
+    set(PHI_NAME
+        libphi.a
+        CACHE INTERNAL "" FORCE)
+  endif()
 endif()
 
-file(
-  GLOB kernel_xpu
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc"
-  "sparse/xpu/*.cc")
+set(PHI_LIB
+    "${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
+    CACHE FILEPATH "PHI Library" FORCE)
 
-if(WITH_GPU OR WITH_ROCM)
-  collect_srcs(kernels_srcs SRCS ${kernel_cu})
-  kernel_declare("${kernel_cu}")
+if(MKL_FOUND AND WITH_ONEMKL)
+  target_include_directories(phi PRIVATE ${MKL_INCLUDE})
 endif()
 
-if(WITH_XPU)
-  if(WITH_XPU_KP)
-    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
-         DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
-    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/
-         DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
-    file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.cu")
-    foreach(kernel ${kernel_xpu_kps})
-      get_filename_component(name ${kernel} NAME_WE)
-      file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
-    endforeach()
-    file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
-    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_kps})
-
-    foreach(kernel ${kernel_cc})
-      configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${kernel}
-                     ${CMAKE_CURRENT_BINARY_DIR}/${kernel} COPYONLY)
-    endforeach()
-    file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc")
-    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_cc})
-    set(kernel_cc "")
+add_dependencies(phi extern_lapack)
+if(WITH_CUTLASS)
+  add_dependencies(phi cutlass_codegen)
+  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION"
+  )# for memory_efficient_attention.h
+endif()
+if(WITH_FLASHATTN)
+  add_dependencies(phi flashattn)
+endif()
 
-  endif()
-  collect_srcs(kernels_srcs SRCS ${kernel_xpu})
-  kernel_declare("${kernel_xpu}")
-  kernel_declare("${kernel_xpu_kps}")
-  kernel_declare("${kernel_xpu_cc}")
+# for inference static library
+if(NOT WITH_SHARED_PHI)
+  get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
+  set(phi_modules ${phi_modules} ${INFERENCE_DEPS} phi)
+  set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
 endif()
 
-collect_srcs(kernels_srcs SRCS ${kernel_cc})
-kernel_declare("${kernel_cc}")
+set(phi_extension_header_file
+    ${CMAKE_CURRENT_SOURCE_DIR}/extension.h
+    CACHE INTERNAL "phi/extension.h file")
+file(
+  WRITE ${phi_extension_header_file}
+  "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n"
+)
 
-if(NOT "${KERNEL_LIST}" STREQUAL "")
-  prune_declaration_h()
-endif()
+file(APPEND ${phi_extension_header_file} "#include \"paddle/phi/config.h\"\n\n")
+# generate inner headers include dir for users
+generate_unify_header(backends EXCLUDES context_pool_utils.h)
+generate_unify_header(core EXCLUDES cuda_stream.h)
+generate_unify_header(infermeta)
+generate_unify_header(kernels SKIP_SUFFIX grad_kernel)
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 304fd3cef793a2..9e13c1c269222f 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -209,11 +209,9 @@ if(WITH_ROCM)
     "gpu/lu_kernel.cu"
     "gpu/matrix_rank_kernel.cu"
     "gpu/matrix_rank_tol_kernel.cu"
-    "gpu/multiclass_nms3_kernel.cu"
     "gpu/put_along_axis_grad_kernel.cu"
     "gpu/put_along_axis_kernel.cu"
     "gpu/qr_kernel.cu"
-    "gpu/rms_norm_grad_kernel.cu"
     "gpu/svd_kernel.cu"
     "gpudnn/mha_cudnn_frontend.cu"
     "fusion/gpu/block_multi_head_attention_kernel.cu"

From e925661e47939375350cf2091866b5c0693076c6 Mon Sep 17 00:00:00 2001
From: yuguo <948529990@qq.com>
Date: Sun, 7 Apr 2024 12:09:01 +0800
Subject: [PATCH 3/9] Replace PADDLE_ENFORCE with PADDLE_THROW in FastGeluFunctor on ROCm

---
 paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
index 1db2d0134f80a9..0c9b879df1fdd4 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
@@ -36,7 +36,7 @@ template <typename T>
 struct FastGeluFunctor {
   inline __device__ T operator()(const T x) const {
 #ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE(0, "FastGelu not surpport for rocm");
+    PADDLE_THROW(phi::errors::Unimplemented("ROCM does not support FastGelu"));
 #else
     return phi::GeluFwd<T, true>(x);
 #endif

From 55aea8f215043766ab4dd2bbf49d3632616cf71b Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Tue, 9 Apr 2024 16:52:57 +0800
Subject: [PATCH 4/9] Replace PADDLE_THROW with assert in FastGeluFunctor on ROCm

---
 paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
index 0c9b879df1fdd4..d2cd2f1b545a7c 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
@@ -36,7 +36,7 @@ template <typename T>
 struct FastGeluFunctor {
   inline __device__ T operator()(const T x) const {
 #ifdef PADDLE_WITH_HIP
-    PADDLE_THROW(phi::errors::Unimplemented("ROCM does not support FastGelu"));
+    assert(0 && "ROCM does not support FastGelu");
 #else
     return phi::GeluFwd<T, true>(x);
 #endif

From 42aa9bf873078867a476ef03e0dc08eb68602355 Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Tue, 9 Apr 2024 20:34:14 +0800
Subject: [PATCH 5/9] fix depthwise conv grad op bug

---
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 77b636bbb4ba1c..b5cfd0b08d2454 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -1468,13 +1468,7 @@ PD_REGISTER_KERNEL(conv3d_grad,
                    phi::Conv3DCudnnGradKernel,
                    float,
                    phi::dtype::float16) {}
-
-PD_REGISTER_KERNEL(depthwise_conv2d_grad,
-                   GPUDNN,
-                   ALL_LAYOUT,
-                   phi::DepthwiseConvCudnnGradKernel,
-                   float,
-                   phi::dtype::float16) {}
+                   
 PD_REGISTER_KERNEL(conv2d_double_grad,
                    GPUDNN,
                    ALL_LAYOUT,

From 490a0d300554a64a8d1eb3246585e40f82dbafbc Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Wed, 10 Apr 2024 16:39:16 +0800
Subject: [PATCH 6/9] fix hip graph test bugs

---
 paddle/fluid/platform/device/gpu/gpu_info.cc | 1 +
 python/paddle/device/cuda/graphs.py          | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 36189cc7e4c90d..73704b04cf90b2 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -217,6 +217,7 @@ class RecordedGpuMallocHelper {
     CUDADeviceGuard guard(dev_id_);
     gpuError_t result;
 #ifdef PADDLE_WITH_HIP
+    phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
     if (UNLIKELY(malloc_managed_memory)) {
       result = hipMallocManaged(ptr, size);
     } else {
diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py
index 598bf64a103871..db425d003b66dc 100644
--- a/python/paddle/device/cuda/graphs.py
+++ b/python/paddle/device/cuda/graphs.py
@@ -23,7 +23,7 @@
     is_compiled_with_rocm,
 )
 
-if is_compiled_with_cuda() and not is_compiled_with_rocm():
+if is_compiled_with_cuda() or is_compiled_with_rocm():
     from paddle.base.core import CUDAGraph as CoreCUDAGraph
 
     def is_cuda_graph_supported():

From 03e9c0aa5c5601c202f84313444d7f6de7447cb6 Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Wed, 10 Apr 2024 19:23:32 +0800
Subject: [PATCH 7/9] Remove whitespace-only line in conv_grad_kernel.cu

---
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index b5cfd0b08d2454..9187ac909aacc6 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -1468,7 +1468,6 @@ PD_REGISTER_KERNEL(conv3d_grad,
                    phi::Conv3DCudnnGradKernel,
                    float,
                    phi::dtype::float16) {}
-                   
 PD_REGISTER_KERNEL(conv2d_double_grad,
                    GPUDNN,
                    ALL_LAYOUT,

From ea704f37f17eae9afe2853ad526023ae77192d60 Mon Sep 17 00:00:00 2001
From: yuguo-Jack <948529990@qq.com>
Date: Thu, 11 Apr 2024 12:33:30 +0800
Subject: [PATCH 8/9] fix hip graph dropout bug

---
 paddle/phi/kernels/funcs/dropout_impl.cu.h    | 22 ++++++-------------
 .../gpu/fused_dropout_add_grad_kernel.cu      | 18 +++++----------
 .../fusion/gpu/fused_dropout_add_kernel.cu    | 18 +++++----------
 3 files changed, 17 insertions(+), 41 deletions(-)

diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
index 463272a37c00d3..150ee570c7122b 100644
--- a/paddle/phi/kernels/funcs/dropout_impl.cu.h
+++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -349,19 +349,6 @@ void DropoutFwGPUKernelDriver(
     } else {
       bool copy_in_kernel = GetSeedDataAndIncrement(
           dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);
-#ifdef PADDLE_WITH_HIP
-      VectorizedRandomGenerator<T>
-          <<<grid_size, block_size, 0, stream>>>(0,
-                                                 size,
-                                                 seed_data,
-                                                 dropout_prob,
-                                                 x_data,
-                                                 mask_data,
-                                                 y_data,
-                                                 upscale_in_train,
-                                                 increment,
-                                                 main_offset);
-#else
       const phi::GPUContext* dev_ctx_p = &dev_ctx;
       auto gen_cuda = dev_ctx.GetGenerator();
       auto state_index = gen_cuda->GetStateIndex();
@@ -372,8 +359,9 @@ void DropoutFwGPUKernelDriver(
             if (!is_fix_seed) {
               // we assume seed is null pointer
               // seed copy to cpu is meaningless here
+#ifndef PADDLE_WITH_HIP
               assert(seed_tensor_ptr == nullptr);
-
+#endif
               auto gen_cuda = dev_ctx_p->GetGenerator();
               // ensure the generator use correct state index
               gen_cuda->SetStateIndex(state_index);
@@ -393,9 +381,14 @@ void DropoutFwGPUKernelDriver(
           cudaKernelCallback = [=](unsigned int id) {
             void* functionPtr =
                 reinterpret_cast<void*>(&(VectorizedRandomGenerator<T>));
+#ifdef PADDLE_WITH_HIP
+            hipFunction_t cudaFunc =
+                reinterpret_cast<hipFunction_t>(functionPtr);
+#else
             cudaFunction_t cudaFunc;
             PADDLE_ENFORCE_GPU_SUCCESS(
                 cudaGetFuncBySymbol(&cudaFunc, functionPtr));
+#endif
             VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc
                      << " functionPtr = " << functionPtr;
 
@@ -417,7 +410,6 @@ void DropoutFwGPUKernelDriver(
 
       VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
                << ", increment = " << increment;
-#endif
     }
   } else {
     if (upscale_in_train) {
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
index 801f070251fb2c..439f7dbc33e3ce 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -202,18 +202,6 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
                        ? NoMaskBwFunctor<T, float>(1.0f - dropout_rate)
                        : NoMaskBwFunctor<T, float>(1.0f - dropout_rate, 1.0f);
 
-#ifdef PADDLE_WITH_HIP
-    VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>
-        <<<grid_size, block_size, 0, stream>>>(0,
-                                               numel,
-                                               seed_data,  //  idx: 2 need save
-                                               x_grad_data,
-                                               y_grad_data,
-                                               out_grad_data,
-                                               increment,  //  idx: 6 need save
-                                               main_offset,
-                                               functor);
-#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -233,9 +221,14 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
         cudaKernelCallback = [=](unsigned int id) {
           void* functionPtr = reinterpret_cast<void*>(
               &(VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>));
+#ifdef PADDLE_WITH_HIP
+          hipFunction_t cudaFunc =
+              reinterpret_cast<hipFunction_t>(functionPtr);
+#else
           cudaFunction_t cudaFunc;
           PADDLE_ENFORCE_GPU_SUCCESS(
               cudaGetFuncBySymbol(&cudaFunc, functionPtr));
+#endif
           VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc
                    << " functionPtr = " << functionPtr;
 
@@ -257,7 +250,6 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
-#endif
   }
 }
 
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index c95c5fbf0ca3de..904db4267fcb6d 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -186,18 +186,6 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     auto dst_functor =
         NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train);
 
-#ifdef PADDLE_WITH_HIP
-    VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>
-        <<<grid_size, block_size, 0, stream>>>(0,
-                                               numel,
-                                               seed_data,  // need save
-                                               x_data,
-                                               y_data,
-                                               out_data,
-                                               increment,  // need save
-                                               main_offset,
-                                               dst_functor);
-#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -237,9 +225,14 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
         cudaKernelCallback = [=](unsigned int id) {
           void* functionPtr = reinterpret_cast<void*>(
               &(VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>));
+#ifdef PADDLE_WITH_HIP
+          hipFunction_t cudaFunc =
+              reinterpret_cast<hipFunction_t>(functionPtr);
+#else
           cudaFunction_t cudaFunc;
           PADDLE_ENFORCE_GPU_SUCCESS(
               cudaGetFuncBySymbol(&cudaFunc, functionPtr));
+#endif
           VLOG(10) << "[cudaKernelCallback] cudaFunc = " << cudaFunc
                    << " functionPtr = " << functionPtr;
 
@@ -260,7 +253,6 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
-#endif
   } else {
     using MT = typename phi::dtype::MPTypeTrait<T>::Type;
     MT factor = static_cast<MT>(1.0f - dropout_rate);

From 3e66860ac222c17b255228caaf1df8ac0815145e Mon Sep 17 00:00:00 2001
From: yuguo <948529990@qq.com>
Date: Thu, 11 Apr 2024 14:09:17 +0800
Subject: [PATCH 9/9] Apply code style fixes to HIP dropout kernel changes

---
 paddle/phi/kernels/funcs/dropout_impl.cu.h                    | 4 ++--
 .../phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu   | 3 +--
 paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu     | 3 +--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
index 150ee570c7122b..855b6fe6c8e15c 100644
--- a/paddle/phi/kernels/funcs/dropout_impl.cu.h
+++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -357,8 +357,8 @@ void DropoutFwGPUKernelDriver(
           parameterSetter = [offset, dev_ctx_p, state_index, is_fix_seed](
                                 phi::backends::gpu::gpuKernelParams& params) {
             if (!is_fix_seed) {
-              // we assume seed is null pointer
-              // seed copy to cpu is meaningless here
+          // we assume seed is null pointer
+          // seed copy to cpu is meaningless here
 #ifndef PADDLE_WITH_HIP
               assert(seed_tensor_ptr == nullptr);
 #endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
index 439f7dbc33e3ce..8994d521382335 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -222,8 +222,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
           void* functionPtr = reinterpret_cast<void*>(
               &(VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>));
 #ifdef PADDLE_WITH_HIP
-          hipFunction_t cudaFunc =
-              reinterpret_cast<hipFunction_t>(functionPtr);
+          hipFunction_t cudaFunc = reinterpret_cast<hipFunction_t>(functionPtr);
 #else
           cudaFunction_t cudaFunc;
           PADDLE_ENFORCE_GPU_SUCCESS(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index 904db4267fcb6d..54ec3604bbee93 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -226,8 +226,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
           void* functionPtr = reinterpret_cast<void*>(
               &(VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>));
 #ifdef PADDLE_WITH_HIP
-          hipFunction_t cudaFunc =
-              reinterpret_cast<hipFunction_t>(functionPtr);
+          hipFunction_t cudaFunc = reinterpret_cast<hipFunction_t>(functionPtr);
 #else
           cudaFunction_t cudaFunc;
           PADDLE_ENFORCE_GPU_SUCCESS(