@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/fast_divmod.h"
 
 #ifdef __HIPCC__
@@ -28,19 +28,62 @@ namespace operators {
 
 enum ElementwiseType { kUnary = 1, kBinary = 2 };
 
+/*
+ * According to NVIDIA, if the number of threads per block is 64/128/256/512,
+ * CUDA performs better. And the number of blocks should be greater (at least
+ * 2x~4x) than the number of SMs. Hence, the SM count is taken into account
+ * in this function to determine the right number of threads per block.
+ */
+inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx,
+                            int64_t numel, int vec_size) {
+  int threads = ELEMENTWISE_BLOCK_SIZE;
+  int sm_count = ctx.GetSMCount();
+  int active_threads_num = numel / vec_size;
+  if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) {
+    // Round up the thread count to a power of 2, so that the number of
+    // active blocks is about twice the SM count, for better performance.
+    threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
+  } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) {
+    // Round up the thread count to a power of 2, so that the number of
+    // active blocks is about four times the SM count, for better performance.
+    threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
+  }
+  // The number of threads per block shall be no less than 64.
+  return std::max(64, threads);
+}
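To make the heuristic above concrete, here is a minimal standalone sketch of the same logic. The local RoundUpToPowerOfTwo helper and the hardcoded SM count stand in for platform::RoundToPowerOfTwo and ctx.GetSMCount(), ELEMENTWISE_BLOCK_SIZE is assumed to be 512, and the inputs in main are illustrative only.

#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int kBlockSize = 512;  // stand-in for ELEMENTWISE_BLOCK_SIZE (assumed)

// Local stand-in for platform::RoundToPowerOfTwo.
int RoundUpToPowerOfTwo(int n) {
  int p = 1;
  while (p < n) p <<= 1;
  return p;
}

int SketchThreadsConfig(int64_t numel, int vec_size, int sm_count) {
  int threads = kBlockSize;
  int active_threads_num = numel / vec_size;
  if (active_threads_num / (sm_count << 1) < kBlockSize) {
    // Aim for roughly 2 active blocks per SM.
    threads = RoundUpToPowerOfTwo(active_threads_num / (sm_count << 1));
  } else if (active_threads_num / (sm_count << 2) < kBlockSize) {
    // Aim for roughly 4 active blocks per SM.
    threads = RoundUpToPowerOfTwo(active_threads_num / (sm_count << 2));
  }
  // Never go below 64 threads per block.
  return std::max(64, threads);
}

int main() {
  // 2^20 elements, vectorized by 4, on a GPU with 80 SMs:
  // 262144 active threads keep the full 512-thread block.
  std::printf("%d\n", SketchThreadsConfig(1 << 20, 4, 80));  // 512
  // A small tensor (4096 elements) falls back to the 64-thread floor.
  std::printf("%d\n", SketchThreadsConfig(4096, 4, 80));     // 64
  return 0;
}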
+
+/*
+ * A vectorized load of width 1, 2, or 4 elements is only possible when the
+ * address of the input data is aligned to that width's byte size. Moreover,
+ * a single vectorized load fetches at most 128 bits. Hence, the valid
+ * vectorized load width shall be determined under both constraints.
+ */
 template <typename T>
 int GetVectorizedSizeImpl(const T *pointer) {
+  constexpr int max_load_bits = 128;
+  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
   uint64_t address = reinterpret_cast<uint64_t>(pointer);
+  constexpr int vec8 =
+      std::alignment_of<CudaAlignedVector<T, 8>>::value;  // NOLINT
   constexpr int vec4 =
       std::alignment_of<CudaAlignedVector<T, 4>>::value;  // NOLINT
   constexpr int vec2 =
       std::alignment_of<CudaAlignedVector<T, 2>>::value;  // NOLINT
-  if (address % vec4 == 0) {
-    return 4;
+  if (address % vec8 == 0) {
+    /*
+     * Currently, vectorized load/store handles no more than 4 elements at
+     * once. If performance tests show that handling 8 elements at once in
+     * vectorized load/store does bring a speedup, the return statement
+     * below can be changed to "return std::min(8, valid_vec_size);".
+     */
+    return std::min(4, valid_vec_size);
+  } else if (address % vec4 == 0) {
+    return std::min(4, valid_vec_size);
   } else if (address % vec2 == 0) {
-    return 2;
+    return std::min(2, valid_vec_size);
+  } else {
+    return 1;
   }
-  return 1;
 }
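A standalone illustration of the alignment check above follows. AlignedVector is a stand-in for CudaAlignedVector (assumed to be an alignas(sizeof(T) * N) struct wrapping T val[N]), and the vec8 branch is omitted for brevity.

#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedVector {
  T val[N];
};

template <typename T>
int SketchVectorizedSize(const T *pointer) {
  constexpr int max_load_bits = 128;
  // Cap the width so a single load never exceeds 128 bits.
  int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
  uint64_t address = reinterpret_cast<uint64_t>(pointer);
  constexpr int vec4 = std::alignment_of<AlignedVector<T, 4>>::value;
  constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value;
  if (address % vec4 == 0) return std::min(4, valid_vec_size);
  if (address % vec2 == 0) return std::min(2, valid_vec_size);
  return 1;
}

int main() {
  alignas(16) float buf[8];
  std::printf("%d\n", SketchVectorizedSize(buf));      // 4: 16-byte aligned
  std::printf("%d\n", SketchVectorizedSize(buf + 2));  // 2: only 8-byte aligned
  std::printf("%d\n", SketchVectorizedSize(buf + 1));  // 1: only 4-byte aligned
  return 0;
}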
 
 template <typename InT, typename OutT>
@@ -96,42 +139,38 @@ struct ElementwiseDataWrapper {
 
 template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
           typename Functor>
-__device__ void VectorizedKernelImpl(
+__device__ inline void VectorizedKernelImpl(
     ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
     int tid) {
   using InVecType = CudaAlignedVector<InT, VecSize>;
   using OutVecType = CudaAlignedVector<OutT, VecSize>;
   InVecType ins_vec[ET];
   OutVecType out_vec;
   InT *ins_ptr[ET];
-  OutT *out_ptr;
+  InT ins[ET];
 #pragma unroll
   for (int i = 0; i < ET; ++i) {
     ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
   }
-  out_ptr = reinterpret_cast<OutT *>(&out_vec);
-
   // load
   data.load_vector(ins_vec, tid);
 
 // compute
 #pragma unroll
   for (int i = 0; i < VecSize; ++i) {
-    InT ins[ET];
 #pragma unroll
     for (int j = 0; j < ET; ++j) {
       ins[j] = ins_ptr[j][i];
     }
-    out_ptr[i] = func(ins);
+    out_vec.val[i] = func(ins);
   }
-
   // store
   data.store_vector(out_vec, tid);
 }
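As a rough illustration of the vectorized load/compute/store pattern above (including writing results through out_vec.val[i]), here is a stripped-down CUDA sketch without the ElementwiseDataWrapper machinery. AlignedVec is a stand-in for CudaAlignedVector, the kernel assumes n is a multiple of VecSize, and the scalar tail handled by ScalarKernelImpl is omitted.

template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedVec {
  T val[N];
};

template <int VecSize, typename T, typename Functor>
__global__ void VecBinaryKernel(const T *x, const T *y, T *z, int n,
                                Functor func) {
  using VecT = AlignedVec<T, VecSize>;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n / VecSize) return;
  // Vectorized load: one wide transaction per operand per thread.
  VecT x_vec = reinterpret_cast<const VecT *>(x)[tid];
  VecT y_vec = reinterpret_cast<const VecT *>(y)[tid];
  VecT z_vec;
#pragma unroll
  for (int i = 0; i < VecSize; ++i) {
    // Compute element-wise and write straight into the output vector.
    z_vec.val[i] = func(x_vec.val[i], y_vec.val[i]);
  }
  // Vectorized store of the whole result vector.
  reinterpret_cast<VecT *>(z)[tid] = z_vec;
}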
 
 template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
           typename Functor>
-__device__ void ScalarKernelImpl(
+__device__ inline void ScalarKernelImpl(
     ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
     int start, int remain) {
   InT ins[ET];
@@ -182,7 +221,7 @@ void LaunchSameDimsElementwiseCudaKernel(
   // calculate the max vec_size for all ins and outs
   auto size = ins[0]->numel();
   int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
-  int block_size = ELEMENTWISE_BLOCK_SIZE;
+  int block_size = GetThreadsConfig(ctx, size, vec_size);
   int grid_size =
       ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
   const InT *in0 = ins[0]->data<InT>();
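A quick numerical check of the launch configuration above, with illustrative values (1,000,003 elements, vec_size 4, block_size 512): each thread covers vec_size elements, so ceil(1000003 / 4) = 250001 threads are needed, and ceil(250001 / 512) = 489 blocks.

#include <cstdint>
#include <cstdio>

int main() {
  int64_t size = 1000003;  // illustrative element count
  int vec_size = 4;        // elements handled per thread
  int block_size = 512;    // threads per block
  // Same rounding-up arithmetic as in the kernel launch above.
  int64_t grid_size =
      ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
  std::printf("grid_size = %lld\n", static_cast<long long>(grid_size));  // 489
  return 0;
}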