@@ -14,15 +14,43 @@ limitations under the License. */
 #pragma once

 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/fast_divmod.h"

+#ifdef __HIPCC__
+#define ELEMENTWISE_BLOCK_SIZE 256
+#else
+#define ELEMENTWISE_BLOCK_SIZE 512
+#endif
+
 namespace paddle {
 namespace operators {

 enum ElementwiseType { kUnary = 1, kBinary = 2 };

-int GetThreadsConfig(const platform::CUDADeviceContext &ctx, int64_t numel,
-                     int vec_size);
+/*
+ * According to NVIDIA, CUDA performs better when the number of threads per
+ * block is 64/128/256/512, and the number of blocks is at least 2x~4x the
+ * number of SMs. Hence, the SM count is taken into account in this function
+ * to determine the right number of threads per block.
+ */
+inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx,
+                            int64_t numel, int vec_size) {
+  int threads = ELEMENTWISE_BLOCK_SIZE;
+  int sm_count = ctx.GetSMCount();
+  int active_threads_num = numel / vec_size;
+  if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) {
+    // Round the thread count up to a power of 2, so that the number of
+    // active blocks is about twice the SM count, for better performance.
+    threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
+  } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) {
+    // Round the thread count up to a power of 2, so that the number of
+    // active blocks is about 4x the SM count, for better performance.
+    threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
+  }
+  // Use no fewer than 64 threads per block.
+  return std::max(64, threads);
+}

 /*
 * Only the address of input data is the multiplier of 1,2,4, vectorized load
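
For context, here is a minimal usage sketch (not part of this patch) of how GetThreadsConfig might feed a kernel launch; the kernel AddKernelSketch, the helper LaunchAddSketch, and the scalar vec_size = 1 choice are illustrative assumptions. Numerically, on a GPU with 80 SMs and numel = 20000: 20000 / (80 << 1) = 125 < ELEMENTWISE_BLOCK_SIZE, so, assuming RoundToPowerOfTwo rounds up to the next power of two, the block size becomes 128.

// Usage sketch only; AddKernelSketch and LaunchAddSketch are hypothetical
// names, not part of this patch.
template <typename T>
__global__ void AddKernelSketch(const T *x, const T *y, T *z, int64_t n) {
  // Grid-stride loop: each thread steps by (gridDim.x * blockDim.x), so
  // any grid size covers the whole range.
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
       i < n; i += stride) {
    z[i] = x[i] + y[i];
  }
}

template <typename T>
void LaunchAddSketch(const platform::CUDADeviceContext &ctx, const T *x,
                     const T *y, T *z, int64_t numel) {
  const int vec_size = 1;  // scalar loads; no vectorization in this sketch
  int threads = GetThreadsConfig(ctx, numel, vec_size);
  // Enough blocks to cover numel once; the grid-stride loop absorbs rounding.
  int blocks = static_cast<int>((numel + threads - 1) / threads);
  AddKernelSketch<T><<<blocks, threads, 0, ctx.stream()>>>(x, y, z, numel);
}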