@@ -14,15 +14,43 @@ limitations under the License. */
 #pragma once

 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/fast_divmod.h"

+#ifdef __HIPCC__
+#define ELEMENTWISE_BLOCK_SIZE 256
+#else
+#define ELEMENTWISE_BLOCK_SIZE 512
+#endif
+
 namespace paddle {
 namespace operators {

 enum ElementwiseType { kUnary = 1, kBinary = 2 };

-int GetThreadsConfig(const platform::CUDADeviceContext &ctx, int64_t numel,
-                     int vec_size);
+/*
+ * According to NVIDIA, CUDA performs better when the number of threads per
+ * block is 64/128/256/512, and the number of blocks is at least 2x~4x the
+ * number of SMs. Hence, the SM count is taken into account in this function
+ * to determine the right number of threads per block.
+ */
+inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx,
+                            int64_t numel, int vec_size) {
+  int threads = ELEMENTWISE_BLOCK_SIZE;
+  int sm_count = ctx.GetSMCount();
+  int active_threads_num = numel / vec_size;
+  if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) {
+    // Round the thread count up to a power of 2, so that the number of
+    // active blocks is about twice the SM count, for better performance.
+    threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
+  } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) {
+    // Round the thread count up to a power of 2, so that the number of
+    // active blocks is about 4x the SM count, for better performance.
+    threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
+  }
+  // Use no fewer than 64 threads per block.
+  return std::max(64, threads);
+}

 /*
 * Only the address of input data is the multiplier of 1,2,4, vectorized load
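
For context, here is a minimal usage sketch (not part of this patch) of how GetThreadsConfig might feed a kernel launch; the kernel AddKernelSketch, the helper LaunchAddSketch, and the scalar vec_size = 1 choice are illustrative assumptions. Numerically, on a GPU with 80 SMs and numel = 20000: 20000 / (80 << 1) = 125 < ELEMENTWISE_BLOCK_SIZE, so, assuming RoundToPowerOfTwo rounds up to the next power of two, the block size becomes 128.

// Usage sketch only; AddKernelSketch and LaunchAddSketch are hypothetical
// names, not part of this patch.
template <typename T>
__global__ void AddKernelSketch(const T *x, const T *y, T *z, int64_t n) {
  // Grid-stride loop: each thread steps by (gridDim.x * blockDim.x), so
  // any grid size covers the whole range.
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
       i < n; i += stride) {
    z[i] = x[i] + y[i];
  }
}

template <typename T>
void LaunchAddSketch(const platform::CUDADeviceContext &ctx, const T *x,
                     const T *y, T *z, int64_t numel) {
  const int vec_size = 1;  // scalar loads; no vectorization in this sketch
  int threads = GetThreadsConfig(ctx, numel, vec_size);
  // Enough blocks to cover numel once; the grid-stride loop absorbs rounding.
  int blocks = static_cast<int>((numel + threads - 1) / threads);
  AddKernelSketch<T><<<blocks, threads, 0, ctx.stream()>>>(x, y, z, numel);
}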