Commit bcd40f2

relu supports bfloat16 data type (#32542)
1 parent b5882c6 commit bcd40f2

4 files changed: +147 -10 lines

paddle/fluid/operators/activation_op.cu

Lines changed: 32 additions & 1 deletion

@@ -13,6 +13,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/operators/math/math_cuda_utils.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 
 namespace paddle {
@@ -1437,9 +1438,9 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */
 
 /* ===========================    relu register  ============================ */
+#ifdef PADDLE_WITH_HIP
 REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor,
                                 CudaReluGradFunctor);
-
 REGISTER_OP_CUDA_KERNEL(
     relu_grad_grad,
     ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
@@ -1448,6 +1449,36 @@ REGISTER_OP_CUDA_KERNEL(
                                     ops::ReluGradGradFunctor<double>>,
     ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
                                     ops::ReluGradGradFunctor<plat::float16>>);
+#else
+REGISTER_OP_CUDA_KERNEL(
+    relu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
+                                    ops::CudaReluFunctor<float>>,
+    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
+                              ops::CudaReluFunctor<double>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaReluFunctor<plat::float16>>,
+    ops::ActivationCudaKernel<plat::CUDADeviceContext,
+                              ops::CudaReluFunctor<plat::bfloat16>>);
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                             ops::CudaReluGradFunctor<float>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaReluGradFunctor<double>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaReluGradFunctor<plat::float16>>,
+    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
+                                  ops::CudaReluGradFunctor<plat::bfloat16>>);
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad_grad,
+    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                                    ops::ReluGradGradFunctor<float>>,
+    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                                    ops::ReluGradGradFunctor<double>>,
+    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
+                                    ops::ReluGradGradFunctor<plat::float16>>,
+    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
+                                    ops::ReluGradGradFunctor<plat::bfloat16>>);
+#endif
 /* ========================================================================== */
 
 /* ===========================    tanh register  ============================ */
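
The #else branch above registers the additional bfloat16 kernels only for the CUDA (non-HIP) build; the HIP branch keeps the existing float/double/float16 registrations. As a rough illustration of the user-visible effect, not part of this commit, a bfloat16 relu could be exercised from dygraph mode along these lines, assuming a CUDA build and a Paddle version whose paddle.cast accepts the 'bfloat16' dtype string:

import paddle
import paddle.nn.functional as F

paddle.set_device('gpu')                      # assumes a CUDA device is available
x = paddle.uniform([11, 17], dtype='float32')
x_bf16 = paddle.cast(x, 'bfloat16')           # bfloat16 cast kernel (see cast_op.cu below)
y_bf16 = F.relu(x_bf16)                       # newly registered bfloat16 relu kernel
y = paddle.cast(y_bf16, 'float32')            # cast back to float32 for inspection
print(y.numpy()[:2, :4])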

paddle/fluid/operators/cast_op.cu

Lines changed: 18 additions & 0 deletions

@@ -95,6 +95,7 @@ struct CastOpFunctor<platform::CUDADeviceContext, InT> {
 
 namespace ops = paddle::operators;
 
+#ifdef PADDLE_WITH_HIP
 REGISTER_OP_CUDA_KERNEL(
     cast, ops::CastOpKernel<paddle::platform::CUDADeviceContext, float>,
     ops::CastOpKernel<paddle::platform::CUDADeviceContext, double>,
@@ -108,3 +109,20 @@ REGISTER_OP_CUDA_KERNEL(
                       paddle::platform::complex64>,
     ops::CastOpKernel<paddle::platform::CUDADeviceContext,
                       paddle::platform::complex128>);
+#else
+REGISTER_OP_CUDA_KERNEL(
+    cast, ops::CastOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::bfloat16>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::complex64>,
+    ops::CastOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::complex128>);
+#endif
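
On the Python side, bfloat16 tensors are carried as np.uint16 arrays of raw bit patterns: a bfloat16 value is simply the upper 16 bits of an IEEE float32. The sketch below, written only for illustration, mimics what helpers such as convert_float_to_uint16 / convert_uint16_to_float in the test framework are assumed to do (truncation here; the real helpers may round):

import numpy as np

def float_to_bf16_bits(x):
    # keep only the upper 16 bits of each float32 value
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float(b):
    # zero-fill the lower 16 bits to recover a float32 value
    b = np.asarray(b, dtype=np.uint16)
    return (b.astype(np.uint32) << 16).view(np.float32)

x = np.array([0.02, -0.5, 1.0], dtype=np.float32)
print(bf16_bits_to_float(float_to_bf16_bits(x)))  # values reduced to bfloat16 precision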

python/paddle/fluid/tests/unittests/op_test.py

Lines changed: 57 additions & 3 deletions

@@ -132,6 +132,8 @@ def product(dim):
         tensor_to_check_dtype = np.float16
         # set delta as np.float16, will automatic convert to float32, float64
         delta = np.array(delta).astype(np.float16)
+    elif tensor_to_check_dtype == core.VarDesc.VarType.BF16:
+        tensor_to_check_dtype = np.float32
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
@@ -140,9 +142,10 @@ def get_output():
         sum = []
         op.run(scope, place)
         for output_name in output_names:
-            sum.append(
-                np.array(scope.find_var(output_name).get_tensor()).astype(
-                    tensor_to_check_dtype).mean())
+            output_numpy = np.array(scope.find_var(output_name).get_tensor())
+            if tensor_to_check._dtype() == core.VarDesc.VarType.BF16:
+                output_numpy = convert_uint16_to_float(output_numpy)
+            sum.append(output_numpy.astype(tensor_to_check_dtype).mean())
         return tensor_to_check_dtype(np.array(sum).sum() / len(output_names))
 
     gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
@@ -152,6 +155,11 @@ def __get_elem__(tensor, i):
             numpy_tensor = np.array(tensor).astype(np.float16)
             numpy_tensor = numpy_tensor.flatten()
             return numpy_tensor[i]
+        elif tensor_to_check._dtype() == core.VarDesc.VarType.BF16:
+            numpy_tensor = np.array(tensor).astype(np.uint16)
+            numpy_tensor = numpy_tensor.flatten()
+            return struct.unpack('<f', struct.pack('<I', numpy_tensor[i]
+                                                   << 16))[0]
         elif tensor_to_check_dtype == np.float32:
             return tensor._get_float_element(i)
         elif tensor_to_check_dtype == np.float64:
@@ -168,6 +176,13 @@ def __set_elem__(tensor, i, e):
             numpy_tensor[i] = e
             numpy_tensor = numpy_tensor.reshape(shape)
             tensor.set(numpy_tensor, place)
+        elif tensor_to_check._dtype() == core.VarDesc.VarType.BF16:
+            numpy_tensor = np.array(tensor).astype(np.uint16)
+            shape = numpy_tensor.shape
+            numpy_tensor = numpy_tensor.flatten()
+            numpy_tensor[i] = np.uint16(copy_bits_from_float_to_uint16(e))
+            numpy_tensor = numpy_tensor.reshape(shape)
+            tensor.set(numpy_tensor, place)
         elif tensor_to_check_dtype == np.float32:
             tensor._set_float_element(i, e)
         elif tensor_to_check_dtype == np.float64:
@@ -1353,6 +1368,8 @@ def _assert_is_close(self, numeric_grads, analytic_grads, names,
                 abs_a[abs_a < 1e-10] = 1e-3
                 abs_a[np.logical_and(abs_a > 1e-10, abs_a <= 1e-8)] *= 1e4
                 abs_a[np.logical_and(abs_a > 1e-8, abs_a <= 1e-6)] *= 1e2
+            elif self.is_bfloat16_op():
+                abs_a[abs_a < 1e-2] = 1
             else:
                 abs_a[abs_a < 1e-3] = 1
 
@@ -1500,6 +1517,13 @@ def check_grad_with_place(self,
            dygraph_grad = self._get_dygraph_grad(
                inputs_to_check, place, output_names, user_defined_grad_outputs,
                no_grad_set)
+            fp32_grads = []
+            for grad in dygraph_grad:
+                if grad.dtype == np.uint16:
+                    grad = convert_uint16_to_float(grad)
+                    max_relative_error = 0.03
+                fp32_grads.append(grad)
+            dygraph_grad = fp32_grads
            self._assert_is_close(numeric_grads, dygraph_grad, inputs_to_check,
                                  max_relative_error,
                                  "Gradient Check On %s" % str(place))
@@ -1544,6 +1568,21 @@ def _get_dygraph_grad(self,
                outputs=outputs,
                attrs=attrs_outputs if hasattr(self, "attrs") else None)
 
+            if self.dtype == np.uint16:
+                cast_inputs = self._find_var_in_dygraph(outputs,
+                                                        output_names[0])
+                cast_outputs = block.create_var(
+                    dtype="float32", shape=cast_inputs[0].shape)
+                cast_op = block.append_op(
+                    inputs={"X": cast_inputs},
+                    outputs={"Out": cast_outputs},
+                    type="cast",
+                    attrs={
+                        "in_dtype": core.VarDesc.VarType.BF16,
+                        "out_dtype": core.VarDesc.VarType.FP32
+                    })
+                outputs = {output_names[0]: cast_outputs}
+
            outputs_valid = {}
            for output_name in output_names:
                outputs_valid[output_name] = self._find_var_in_dygraph(
@@ -1659,6 +1698,21 @@ def _get_gradient(self,
            feed_dict = self.feed_var(inputs, place)
 
            if user_defined_grad_outputs is None:
+                if self.dtype == np.uint16:
+                    cast_inputs = list(map(block.var, output_names))
+                    cast_outputs = block.create_var(
+                        dtype="float32", shape=cast_inputs[0].shape)
+                    cast_op = block.append_op(
+                        inputs={"X": cast_inputs},
+                        outputs={"Out": cast_outputs},
+                        type="cast",
+                        attrs={
+                            "in_dtype": core.VarDesc.VarType.BF16,
+                            "out_dtype": core.VarDesc.VarType.FP32
+                        })
+                    cast_op.desc.infer_var_type(block.desc)
+                    cast_op.desc.infer_shape(block.desc)
+                    output_names = [cast_outputs.name]
                loss = append_loss_ops(block, output_names)
                param_grad_list = append_backward(
                    loss=loss,
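
The per-element accessors above handle BF16 tensors at the bit level: __get_elem__ widens a stored uint16 to a 32-bit pattern with struct and reinterprets it as float32, while __set_elem__ narrows a Python float back to 16 bits (copy_bits_from_float_to_uint16 is assumed to behave like the write path sketched here). A self-contained round trip, again only for illustration:

import struct
import numpy as np

def bf16_elem_to_float(bits):
    # widen the 16 stored bits to a 32-bit pattern, then reinterpret as float32
    return struct.unpack('<f', struct.pack('<I', int(bits) << 16))[0]

def float_to_bf16_elem(value):
    # keep only the high 16 bits of the float32 representation
    return np.uint16(struct.unpack('<I', struct.pack('<f', value))[0] >> 16)

print(bf16_elem_to_float(float_to_bf16_elem(1.5)))  # 1.5 is exactly representable in bfloat16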

python/paddle/fluid/tests/unittests/test_activation_op.py

Lines changed: 40 additions & 6 deletions

@@ -18,7 +18,7 @@
 import numpy as np
 from scipy.special import expit, erf
 
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
@@ -1103,12 +1103,19 @@ def setUp(self):
         self.init_dtype()
 
         np.random.seed(1024)
-        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0)
+        if self.dtype == np.uint16:
+            x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32)
+            # The same reason with TestAbs
+            x[np.abs(x) < 0.005] = 0.02
+            out = convert_float_to_uint16(np.maximum(x, 0))
+            self.inputs = {'X': convert_float_to_uint16(x)}
+        else:
+            x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+            # The same reason with TestAbs
+            x[np.abs(x) < 0.005] = 0.02
+            out = np.maximum(x, 0)
+            self.inputs = {'X': x}
 
-        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
@@ -2739,5 +2746,32 @@ def test_check_grad(self):
 create_test_act_fp16_class(TestSwish, grad_atol=0.85)
 create_test_act_fp16_class(TestHardSwish)
 
+
+def create_test_act_bf16_class(parent,
+                               atol=1e-2,
+                               grad_check=True,
+                               grad_atol=0.80):
+    @unittest.skipIf(not paddle.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestActBF16(parent):
+        def init_dtype(self):
+            self.dtype = np.uint16
+
+        def test_check_output(self):
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=atol)
+
+        def test_check_grad(self):
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['X'], 'Out', max_relative_error=grad_atol)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "bf16")
+    TestActBF16.__name__ = cls_name
+    globals()[cls_name] = TestActBF16
+
+
+create_test_act_bf16_class(TestRelu)
+
 if __name__ == "__main__":
     unittest.main()
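
A note on the factory above: create_test_act_bf16_class builds a subclass of the given test case, renames it, and publishes it in globals() so that standard unittest discovery collects it like a hand-written class; the single call create_test_act_bf16_class(TestRelu) therefore adds a case named TestRelu_bf16 that forces dtype np.uint16 and checks output and gradients on CUDAPlace(0). A tiny, self-contained demo of that registration pattern, with hypothetical names:

import unittest

def create_demo_case(parent, suffix):
    # hypothetical miniature of the factory pattern used in the diff above
    class _Generated(parent):
        def test_marker(self):
            self.assertTrue(True)
    cls_name = "{0}_{1}".format(parent.__name__, suffix)
    _Generated.__name__ = cls_name
    globals()[cls_name] = _Generated  # unittest discovery now finds it by name

class DemoBase(unittest.TestCase):
    pass

create_demo_case(DemoBase, "bf16")  # registers DemoBase_bf16

if __name__ == "__main__":
    unittest.main()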
