Skip to content

Commit 13698a6

Browse files
committed
fix multiple errors with inline syntax
1 parent 85d954c commit 13698a6

File tree

4 files changed

+31
-58
lines changed

4 files changed

+31
-58
lines changed

paddle/fluid/operators/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ if (WITH_GPU)
5656
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
5757
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
5858
endif()
59-
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} elementwise_op)
6059
endif()
6160

6261

paddle/fluid/operators/elementwise/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@ if(WITH_UNITY_BUILD)
33
# Load Unity Build rules for operators in paddle/fluid/operators/elementwise.
44
include(unity_build_rule.cmake)
55
endif()
6-
nv_library(elementwise_op SRCS elementwise_op_impl.cu)
7-
register_operators(DEPS op_version_registry elementwise_op)
6+
register_operators(DEPS op_version_registry)
87

98
cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor)
109
cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor)

paddle/fluid/operators/elementwise/elementwise_op_impl.cu

Lines changed: 0 additions & 53 deletions
This file was deleted.

paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,43 @@ limitations under the License. */
1414
#pragma once
1515

1616
#include "paddle/fluid/framework/tensor.h"
17+
#include "paddle/fluid/platform/cuda_device_function.h"
1718
#include "paddle/fluid/platform/fast_divmod.h"
1819

20+
#ifdef __HIPCC__
21+
#define ELEMENTWISE_BLOCK_SIZE 256
22+
#else
23+
#define ELEMENTWISE_BLOCK_SIZE 512
24+
#endif
25+
1926
namespace paddle {
2027
namespace operators {
2128

2229
enum ElementwiseType { kUnary = 1, kBinary = 2 };
2330

24-
int GetThreadsConfig(const platform::CUDADeviceContext &ctx, int64_t numel,
25-
int vec_size);
31+
/*
32+
* According to NVIDIA, if number of threads per block is 64/128/256/512,
33+
* cuda performs better. And number of blocks should be greater (at least
34+
* 2x~4x) than number of SMs. Hence, SM count is taken into account within
35+
* this function to determine the right number of threads per block.
36+
*/
37+
inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx,
38+
int64_t numel, int vec_size) {
39+
int threads = ELEMENTWISE_BLOCK_SIZE;
40+
int sm_count = ctx.GetSMCount();
41+
int active_threads_num = numel / vec_size;
42+
if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) {
43+
// Round up threads number to a power of 2, while the number
44+
// of active blocks is about twice the SM count, to acquire better performance.
45+
threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
46+
} else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) {
47+
// Round up threads number to a power of 2, while the number
48+
// of active blocks is about 4 times the SM count, to acquire better performance.
49+
threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
50+
}
51+
// Number of threads per block shall be at least 64.
52+
return std::max(64, threads);
53+
}
2654

2755
/*
2856
* Only the address of input data is the multiplier of 1,2,4, vectorized load

0 commit comments

Comments
 (0)