From a679c7396a6ca0cdc58d68877c422e65190314e0 Mon Sep 17 00:00:00 2001 From: yuguo-Jack <948529990@qq.com> Date: Wed, 3 Apr 2024 14:57:47 +0000 Subject: [PATCH 1/9] [DCU] fix bugs and surpport some fused ops --- paddle/phi/CMakeLists.txt | 470 ++++++++++-------- paddle/phi/core/visit_type.h | 2 +- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 24 +- ...dropout_residual_layer_norm_grad_kernel.cu | 23 +- ...bias_dropout_residual_layer_norm_kernel.cu | 18 +- .../fusion/gpu/fused_dropout_act_bias.h | 8 +- .../kernels/fusion/gpu/fused_dropout_common.h | 39 +- .../fusion/gpu/fused_layernorm_kernel.cu | 306 +++++++----- .../fused_layernorm_residual_dropout_bias.h | 46 +- .../fusion/gpu/fused_residual_dropout_bias.h | 6 +- paddle/phi/kernels/gpu/rms_norm_funcs.h | 37 +- .../phi/kernels/gpu/rms_norm_grad_kernel.cu | 24 +- paddle/phi/kernels/gpu/rms_norm_kernel.cu | 235 +++++---- test/legacy_test/test_fused_layernorm_op.py | 6 +- test/legacy_test/test_rms_norm_op.py | 6 +- 15 files changed, 770 insertions(+), 480 deletions(-) diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7325aef2202b59..9e13c1c269222f 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -1,231 +1,311 @@ -configure_file(config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/config.h) - -# phi auto cmake utils -include(phi) - -set(common_srcs CACHE INTERNAL "" FORCE) -set(api_srcs CACHE INTERNAL "" FORCE) -set(capi_srcs CACHE INTERNAL "" FORCE) -set(core_srcs CACHE INTERNAL "" FORCE) -set(backends_srcs CACHE INTERNAL "" FORCE) -set(kernels_srcs CACHE INTERNAL "" FORCE) -set(infermeta_srcs CACHE INTERNAL "" FORCE) -#set(excluded_srcs CACHE INTERNAL "" FORCE) - -# paddle experimental common components -add_subdirectory(common) -# phi (low level) api headers: include -# phi (high level) api -add_subdirectory(api) -# phi core components -add_subdirectory(core) -# phi components of specific backends -add_subdirectory(backends) -# phi kernels for diff device -add_subdirectory(kernels) -# phi infermeta -add_subdirectory(infermeta) -# phi tools -add_subdirectory(tools) -# phi capi -if(WITH_CUSTOM_DEVICE) - add_subdirectory(capi) -endif() - -set(PHI_DEPS - phi_profiler_proto - auto_parallel_proto - glog - warpctc - warprnnt - eigen3 - xxhash - cblas - utf8proc - common) - -set(INFERENCE_DEPS phi_profiler_proto auto_parallel_proto) - -if(WITH_GPU) - list(APPEND PHI_DEPS external_error_proto) -endif() - -if(WITH_ASCEND_CL) - list(APPEND PHI_DEPS npu_hccl) -endif() +set(kernel_declare_file + ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp + CACHE INTERNAL "declarations.h file") +set(kernel_declare_file_final + ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h) +file( + WRITE ${kernel_declare_file} + "// Generated by the paddle/phi/kernels/CMakeLists.txt. DO NOT EDIT!\n\n#pragma once\n\n" +) +file(APPEND ${kernel_declare_file} + "#include \"paddle/phi/core/kernel_registry.h\"\n\n") +set(kernel_declare_file_prune + ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.prune + CACHE INTERNAL "declarations.h file") -if(WITH_FLASHATTN) - list(APPEND PHI_DEPS flashattn) -endif() +# phi functors and functions called by kernels +add_subdirectory(funcs) -if(WITH_XBYAK) - list(APPEND PHI_DEPS xbyak) -endif() +# kernel autotune +add_subdirectory(autotune) -if(WITH_MKLDNN) - list(APPEND PHI_DEPS mkldnn) -endif() +copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) -if(WITH_GLOO) - list(APPEND PHI_DEPS gloo) -endif() +file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") +file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") +file(GLOB kernel_primitive_h "primitive/*.h") -if(WITH_CUDNN_FRONTEND) - list(APPEND PHI_DEPS cudnn-frontend) -endif() +# fusion ops would be included here +file( + GLOB kernel_cu + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "gpu/*.cu" + "gpu/*.cu.cc" + "gpudnn/*.cu" + "kps/*.cu" + "legacy/kps/*.cu" + "legacy/gpu/*.cu" + "selected_rows/gpu/*.cu" + "sparse/gpu/*.cu" + "strings/gpu/*.cu" + "fusion/gpu/*.cu") -if(WITH_POCKETFFT) - list(APPEND PHI_DEPS pocketfft) +if(APPLE OR WIN32) + list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") endif() -if(WITH_MKLML) - list(APPEND PHI_DEPS pocketfft dynload_mklml) - list(APPEND INFERENCE_DEPS dynload_mklml) +if(NOT WITH_DGC) + list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu") endif() -if(WITH_XPU) - list(APPEND PHI_DEPS xpulib) - if(WITH_XPU_PLUGIN) - add_subdirectory(kernels/xpu/plugin) - list(APPEND PHI_DEPS xpuplugin) - endif() +if(DEFINED REDUCE_INFERENCE_LIB_SIZE) + list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$") + list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$") endif() -if(WITH_DGC) - list(APPEND PHI_DEPS dgc) -endif() +if(WITH_CUTLASS) + execute_process( + COMMAND + ${PYTHON_EXECUTABLE} + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py + --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_tmp" + RESULT_VARIABLE memory_efficient_attention_gen_res) -set(PHI_SRCS - ${common_srcs} - ${api_srcs} - ${core_srcs} - ${backends_srcs} - ${kernels_srcs} - ${infermeta_srcs} - ${capi_srcs}) - -if(WITH_SHARED_PHI) - set(PHI_BUILD_TYPE - SHARED - CACHE INTERNAL "" FORCE) -else() - set(PHI_BUILD_TYPE - STATIC - CACHE INTERNAL "" FORCE) -endif() + execute_process( + COMMAND + ${PYTHON_EXECUTABLE} + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py + --cuda_arch "${NVCC_ARCH_BIN}" --gen_dir "autogen_variable_tmp" + RESULT_VARIABLE memory_efficient_attention_gen_res) -if(WITH_AVX - AND AVX512F_FOUND - AND AVX512F_FLAG - AND WITH_MKL) - set_source_files_properties( - kernels/fusion/cpu/self_dp_attention_kernel.cc - PROPERTIES COMPILE_FLAGS "-Wno-maybe-uninitialized -mfma ${AVX512F_FLAG}") -endif() + if(NOT memory_efficient_attention_gen_res EQUAL 0) + message( + FATAL_ERROR + "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}" + ) + endif() -if(WITH_GPU) - set_source_files_properties( - backends/gpu/gpu_resources.cc - PROPERTIES COMPILE_FLAGS - "-DCUDA_REAL_ARCHS=\"${NVCC_FLAGS_EXTRA_real_archs}\"") - nv_library( - phi ${PHI_BUILD_TYPE} - SRCS ${PHI_SRCS} - DEPS ${PHI_DEPS}) - -elseif(WITH_ROCM) - hip_library( - phi ${PHI_BUILD_TYPE} - SRCS ${PHI_SRCS} - DEPS ${PHI_DEPS}) - -elseif(WITH_XPU_KP) - xpu_library( - phi ${PHI_BUILD_TYPE} - SRCS ${PHI_SRCS} - DEPS ${PHI_DEPS}) -else() - cc_library( - phi ${PHI_BUILD_TYPE} - SRCS ${PHI_SRCS} - DEPS ${PHI_DEPS}) -endif() + set(autogen_tmp_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp + ) + set(autogen_variable_tmp_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp + ) + set(autogen_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen + ) + set(autogen_variable_dir + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable + ) -target_compile_definitions(phi PUBLIC PHI_INNER) + file(GLOB generated_files ${autogen_tmp_dir}/*.h ${autogen_tmp_dir}/impl/*.cu) -if(WIN32) - target_link_libraries(phi shlwapi.lib) -endif() + file(GLOB variable_generated_files ${autogen_variable_tmp_dir}/*.h + ${autogen_variable_tmp_dir}/impl/*.cu) -if(WIN32) - if(WITH_SHARED_PHI) - set_property(TARGET phi PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) - set(PHI_NAME - phi.dll - CACHE INTERNAL "" FORCE) + if(EXISTS ${autogen_dir}) + foreach(gen_file ${generated_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${autogen_dir}") else() - set(PHI_NAME - phi.lib - CACHE INTERNAL "" FORCE) + foreach(gen_file ${generated_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${autogen_dir}") endif() -elseif(APPLE) - if(WITH_SHARED_PHI) - set(PHI_NAME - libphi.dylib - CACHE INTERNAL "" FORCE) + + if(EXISTS ${autogen_variable_dir}) + foreach(gen_file ${variable_generated_files}) + string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file + ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${autogen_variable_dir}") else() - set(PHI_NAME - libphi.a - CACHE INTERNAL "" FORCE) + foreach(gen_file ${variable_generated_files}) + string(REPLACE "autogen_variable_tmp" "autogen_variable" now_file + ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${autogen_variable_dir}") endif() -else() - if(WITH_SHARED_PHI) - set(PHI_NAME - libphi.so - CACHE INTERNAL "" FORCE) + + file( + REMOVE_RECURSE + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_tmp + ) + file( + REMOVE_RECURSE + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen_variable_tmp + ) + + execute_process( + COMMAND + ${CMAKE_COMMAND} -E make_directory + "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp" + COMMAND ${PYTHON_EXECUTABLE} generic_mixed_gemm_kernelLauncher.py + --cuda_arch "${NVCC_ARCH_BIN}" + WORKING_DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm" + ) + set(fpA_intB_gemm_autogen_tmp_dir + ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp + ) + set(fpA_intB_gemm_autogen_dir + ${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen + ) + + file(GLOB fpA_intB_gemm_autogen_files ${fpA_intB_gemm_autogen_tmp_dir}/*.h + ${fpA_intB_gemm_autogen_tmp_dir}/*.cu) + + if(EXISTS ${fpA_intB_gemm_autogen_dir}) + foreach(gen_file ${fpA_intB_gemm_autogen_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${gen_file}" "${now_file}") + endforeach() + message("copy if different ${fpA_intB_gemm_autogen_dir}") else() - set(PHI_NAME - libphi.a - CACHE INTERNAL "" FORCE) + foreach(gen_file ${fpA_intB_gemm_autogen_files}) + string(REPLACE "autogen_tmp" "autogen" now_file ${gen_file}) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}" + "${now_file}") + endforeach() + message("copy ${fpA_intB_gemm_autogen_dir}") endif() + + file( + GLOB cutlass_cu + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "fusion/cutlass/*.cu" + "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu" + "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu" + "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu" + "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu") + + list(APPEND kernel_cu ${cutlass_cu}) endif() -set(PHI_LIB - "${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}" - CACHE FILEPATH "PHI Library" FORCE) +if(NOT WITH_CUDNN_FRONTEND) + list( + REMOVE_ITEM + kernel_cu + "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu" + "fusion/gpu/fused_scale_bias_add_relu_kernel.cu" + "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu" + "fusion/gpu/fused_dot_product_attention_op.cu" + "fusion/gpu/max_pool2d_v2_grad_kernel.cu" + "fusion/gpu/max_pool2d_v2_kernel.cu") +endif() -if(MKL_FOUND AND WITH_ONEMKL) - target_include_directories(phi PRIVATE ${MKL_INCLUDE}) +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list( + REMOVE_ITEM + kernel_cu + "gpu/affine_grid_grad_kernel.cu" + "gpu/apply_per_channel_scale_kernel.cu" + "gpu/cholesky_solve_kernel.cu" + "gpu/eigh_kernel.cu" + "gpu/eigvalsh_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/lu_kernel.cu" + "gpu/matrix_rank_kernel.cu" + "gpu/matrix_rank_tol_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" + "gpu/qr_kernel.cu" + "gpu/svd_kernel.cu" + "gpudnn/mha_cudnn_frontend.cu" + "fusion/gpu/block_multi_head_attention_kernel.cu" + "fusion/gpu/fused_bn_add_activation_grad_kernel.cu" + "fusion/gpu/fused_bn_add_activation_kernel.cu" + "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu") endif() -add_dependencies(phi extern_lapack) -if(WITH_CUTLASS) - add_dependencies(phi cutlass_codegen) - add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION" - )# for memory_efficient_attention.h +set(cc_search_pattern + "*.cc" + "cpu/*.cc" + "legacy/*.cc" + "legacy/cpu/*.cc" + "selected_rows/*.cc" + "selected_rows/cpu/*.cc" + "sparse/*.cc" + "sparse/cpu/*.cc" + "legacy/*.cc" + "legacy/cpu/*.cc" + "strings/*.cc" + "strings/cpu/*.cc" + "fusion/*.cc" + "stride/*.cc" + "fusion/cpu/*.cc") + +if(WITH_MKLDNN) + set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc" + "fusion/onednn/*.cc") +endif() + +if(WITH_CUSTOM_DEVICE) + set(cc_search_pattern ${cc_search_pattern} "custom/*.cc") endif() -if(WITH_FLASHATTN) - add_dependencies(phi flashattn) + +file( + GLOB kernel_cc + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + ${cc_search_pattern}) + +if(DEFINED REDUCE_INFERENCE_LIB_SIZE) + list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$") endif() -# for inference static library -if(NOT WITH_SHARED_PHI) - get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) - set(phi_modules ${phi_modules} ${INFERENCE_DEPS} phi) - set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}") +if(NOT + (WITH_AVX + AND AVX512F_FOUND + AND AVX512F_FLAG + AND WITH_MKL)) + list(REMOVE_ITEM kernel_cc "fusion/cpu/self_dp_attention_kernel.cc") endif() -set(phi_extension_header_file - ${CMAKE_CURRENT_SOURCE_DIR}/extension.h - CACHE INTERNAL "phi/extension.h file") file( - WRITE ${phi_extension_header_file} - "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n" -) + GLOB kernel_xpu + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" + "sparse/xpu/*.cc") + +if(WITH_GPU OR WITH_ROCM) + collect_srcs(kernels_srcs SRCS ${kernel_cu}) + kernel_declare("${kernel_cu}") +endif() + +if(WITH_XPU) + if(WITH_XPU_KP) + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/ + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/) + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/ + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/) + file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.cu") + foreach(kernel ${kernel_xpu_kps}) + get_filename_component(name ${kernel} NAME_WE) + file(RENAME ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps") + endforeach() + file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps") + collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_kps}) -file(APPEND ${phi_extension_header_file} "#include \"paddle/phi/config.h\"\n\n") -# generate inner headers include dir for users -generate_unify_header(backends EXCLUDES context_pool_utils.h) -generate_unify_header(core EXCLUDES cuda_stream.h) -generate_unify_header(infermeta) -generate_unify_header(kernels SKIP_SUFFIX grad_kernel) + foreach(kernel ${kernel_cc}) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${kernel} + ${CMAKE_CURRENT_BINARY_DIR}/${kernel} COPYONLY) + endforeach() + file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc") + collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_cc}) + set(kernel_cc "") + + endif() + collect_srcs(kernels_srcs SRCS ${kernel_xpu}) + kernel_declare("${kernel_xpu}") + kernel_declare("${kernel_xpu_kps}") + kernel_declare("${kernel_xpu_cc}") +endif() + +collect_srcs(kernels_srcs SRCS ${kernel_cc}) +kernel_declare("${kernel_cc}") + +if(NOT "${KERNEL_LIST}" STREQUAL "") + prune_declaration_h() +endif() diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index ad30da4ddcd6f0..03da0544500920 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_XPU) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 6a82875819161b..3eee52efcbebe6 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -166,14 +166,14 @@ __inline__ __device__ double rsqrt_(const double val) { return ::rsqrt(val); } -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) || defined(PADDLE_WITH_HIP) template <> __inline__ __device__ half rsqrt_(const half val) { return hrsqrt(val); } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template 1) { if (lane == 0) { @@ -290,7 +294,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( #pragma unroll for (int it = 1; it < THREADS_PER_WARP; it *= 2) { +#ifdef PADDLE_WITH_HIP + var_local += __shfl_xor(var_local, it); +#else var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); +#endif } if (WARPS_N > 1) { @@ -546,7 +554,7 @@ __inline__ __device__ void cuLoadAddStridedInputs(const int64_t i1_block, } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template 0; it /= 2) { +#ifdef PADDLE_WITH_HIP + sum_loss1 += __shfl_down(sum_loss1, it); + sum_loss2 += __shfl_down(sum_loss2, it); +#else sum_loss1 += __shfl_down_sync(uint32_t(-1), sum_loss1, it); sum_loss2 += __shfl_down_sync(uint32_t(-1), sum_loss2, it); +#endif } if (lane == 0) { diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu index 60a82cfe7c1980..48819c12a8dc0e 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu @@ -11,7 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_HIP +#include +#include +#include +namespace cub = hipcub; +#else #include #include #endif @@ -21,9 +26,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" -#ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" -#endif namespace phi { namespace fusion { @@ -51,7 +54,6 @@ void FusedBiasDropoutResidualLnGradKernel( DenseTensor* bias_grad, DenseTensor* ln_scale_grad, DenseTensor* ln_bias_grad) { -#ifndef PADDLE_WITH_HIP using U = LayerNormParamType; auto* d_y_data = y_grad.data(); auto* ln_scale_data = @@ -114,15 +116,19 @@ void FusedBiasDropoutResidualLnGradKernel( d_x_data, d_bias_data, d_residual_data); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "FusedBiasDropoutResidualLnGradKernel not surpport for rocm")); -#endif } } // namespace fusion } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBiasDropoutResidualLnGradKernel, + float, + phi::dtype::float16) {} +#else PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad, GPU, ALL_LAYOUT, @@ -130,3 +136,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm_grad, float, double, phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu index 37450d3a4e178b..ca0bcbe7f2466a 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu @@ -17,9 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" -#ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" -#endif namespace phi { namespace fusion { @@ -42,7 +40,6 @@ void FusedBiasDropoutResidualLnKernel( DenseTensor* dropout_mask_out, DenseTensor* ln_mean, DenseTensor* ln_variance) { -#ifndef PADDLE_WITH_HIP using U = phi::funcs::LayerNormParamType; auto* x_data = x.data(); auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data(); @@ -95,14 +92,20 @@ void FusedBiasDropoutResidualLnKernel( y_data, ln_mean_data, ln_var_data); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "FusedBiasDropoutResidualLnKernel not support for rocm")); -#endif } } // namespace fusion } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm, + GPU, + ALL_LAYOUT, + phi::fusion::FusedBiasDropoutResidualLnKernel, + float, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); +} +#else PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm, GPU, ALL_LAYOUT, @@ -112,3 +115,4 @@ PD_REGISTER_KERNEL(fused_bias_dropout_residual_layer_norm, phi::dtype::float16) { kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } +#endif diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h index e5f5c9ba50ba45..1db2d0134f80a9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h @@ -35,7 +35,11 @@ struct GeluFunctor { template struct FastGeluFunctor { inline __device__ T operator()(const T x) const { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE(0, "FastGelu not surpport for rocm"); +#else return phi::GeluFwd(x); +#endif } }; @@ -92,8 +96,8 @@ __global__ void FusedDropoutActBias( int row_id = blockIdx.y; int idx = row_id * cols + col_id; - curandStatePhilox4_32_10_t state; - curand_init(seed, idx, increment, &state); + GPURAND(StatePhilox4_32_10_t) state; + GPURAND(_init)(seed, idx, increment, &state); const T factor = phi::fusion::GetFactor(dropout_prob, is_upscale_in_train, is_test); diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h index 2ef46378b1b9bd..ef9ecbb435fdba 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h @@ -20,10 +20,25 @@ limitations under the License. */ #include #endif +#ifdef PADDLE_WITH_HIP +#include +#include +#include +#include +#endif + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" +#ifdef PADDLE_WITH_HIP +#define GPU(str) hip##str +#define GPURAND(str) hiprand##str +#else +#define GPU(str) cuda##str +#define GPURAND(str) curand##str +#endif + namespace phi { namespace fusion { @@ -63,26 +78,29 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids( } template -__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec(GPURAND(StatePhilox4_32_10_t) * state, float *data); template <> -__forceinline__ __device__ void RandVec<1>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<1>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { - data[0] = curand_uniform(state); + data[0] = GPURAND(_uniform)(state); } template <> -__forceinline__ __device__ void RandVec<2>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<2>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { - data[0] = curand_uniform(state); - data[1] = curand_uniform(state); + data[0] = GPURAND(_uniform)(state); + data[1] = GPURAND(_uniform)(state); } template <> -__forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<4>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { - float4 rand4 = curand_uniform4(state); + float4 rand4 = GPURAND(_uniform4)(state); data[0] = rand4.x; data[1] = rand4.y; data[2] = rand4.w; @@ -90,7 +108,8 @@ __forceinline__ __device__ void RandVec<4>(curandStatePhilox4_32_10_t *state, } template <> -__forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, +__forceinline__ __device__ void RandVec<8>(GPURAND(StatePhilox4_32_10_t) * + state, float *data) { RandVec<4>(state, data); RandVec<4>(state, data + 4); @@ -99,7 +118,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, template inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) { PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); + GPU(MemsetAsync)(ptr, 0, size * sizeof(T), ctx.stream())); } /** diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu index e31b24e7e105e5..221019531a5486 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu @@ -38,10 +38,19 @@ limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" -#ifndef PADDLE_WITH_HIP -#include #include "paddle/phi/kernels/fusion/gpu/attention_layer.norm.h" #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" +#ifdef PADDLE_WITH_HIP +#include +#include +#include +namespace cub = hipcub; +#define GPU(str) hip##str +#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount +#else +#include +#define GPU(str) cuda##str +#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount #endif namespace phi { @@ -50,9 +59,11 @@ namespace fusion { namespace { -#ifndef PADDLE_WITH_HIP - +#ifdef PADDLE_WITH_HIP +constexpr int kWarpSize = 64; +#else constexpr int kWarpSize = 32; +#endif template struct SumOp { @@ -74,7 +85,11 @@ template